"""
Scrapes the Pennsylvania lottery website for scratch-off ticket
data and calculates the expected value for each game.

Pennsylvania publishes the number of tickets printed and how many
tickets are printed at each prize level.

We can calculate the expected value of a game by summing
the value of all the prizes and dividing that by the cost
of all the tickets.

The palottery website has an "index" page that has links to every game.
Each individual game has a link to a "game rules" page.
We can start at the index and visit every game rules page, then we
can find the html table on that page which has the detailed prize
information and run our calculations.

Website that we'll be scraping:
https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx

Example usage:
    python -m pennsylvania
Or:
    LOGLEVEL=DEBUG USE_CACHE=True python -m pennsylvania

The following behavior is configurable through shell environment variables.

Set LOGLEVEL to print useful debug info to console.
LOGLEVEL=[DEBUG,INFO,WARNING,ERROR,CRITICAL]
Defaults to WARNING.

Set USE_CACHE to cache responses. This speeds up development
and is nice to the servers we're hitting.
USE_CACHE=[True]
Defaults to False. Note: Setting this env variable to the string False
will still enable the cache, because any non-empty string (including
"False") is truthy. Either set it to True or don't set it.
"""
import base64
import sys
import traceback
from copy import deepcopy
import locale
import logging
import os
import re
from tempfile import gettempdir

from bs4 import BeautifulSoup as bs
import requests

from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html

logger = logging.getLogger(__name__)

locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")

# It's worth assigning to constants values that are used in many
# places throughout a script.
BASE_URL = "https://www.palottery.state.pa.us"
INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"


def find_game_names(html):
    """
    Return the display name of every game listed on the index page.

    Game names are the text of the "info" div nested inside anchor
    elements which have the class "activeGame_li". Runs of whitespace
    in a name are collapsed to a single space.
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return [
        re.sub(r"\s+", " ", g.find("div", class_="info").text)
        for g in game_elements
    ]


def find_game_urls(html):
    """
    Return the absolute url of every game listed on the index page.

    Luckily, all of the Pennsylvania games are listed on a single html page.
    We don't have to mess around with any pagination and making multiple
    requests.

    The links are "href" attributes of anchor tags with the class
    "activeGame_li". Each href is appended to BASE_URL.
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return [f"{BASE_URL}{e.attrs['href']}" for e in game_elements]


def find_complete_game_rules_url(html):
    """
    Return the url of the "complete game rules" page for a game page.

    Game pages have a link to the complete game rules. The complete game
    rules have a table of all prizes for a game.

    The link to the game rules page is the second anchor tag nested under
    the div with the class "instant-games-games-info". Its href is
    returned unmodified.
    """
    soup = bs(html, "lxml")
    games_info_div = soup.find("div", class_="instant-games-games-info")
    games_info_anchor = games_info_div.find_all("a")[1]
    return games_info_anchor.attrs["href"]


def find_rows(html):
    """
    From a game rules page, find the rows of the table that have the number
    of tickets and the value of each prize.

    Returns a list of ``(prize_value, number_of_tickets)`` tuples. The first
    tuple is a synthetic ``(0, number_of_losing_tickets)`` row; the rest
    follow the order of the rows in the prize table.
    """
    soup = bs(html, "lxml")

    # Some game rules pages have multiple tables.
    # The first table has the prizes.
    # soup.find returns the first matching element.
    prize_table = soup.find("table")
    row_elements = prize_table.find_all("tr")

    # The first row is headers so we want to skip it for the
    # calculations, but it includes an important bit of information.
    #
    # The rows only contain winning ticket info. We also care about a
    # row for the losing prize tier. It will have a value of "0" but we
    # want to know how many losing tickets there are.
    #
    # The last header cell contains the total number of tickets printed.
    # Subtracting the number of winning tickets from it gives us the
    # number of losing tickets.
    header_row = row_elements[0]
    header_columns = header_row.find_all("th")
    total_number_tickets = int(re.sub(r"\D", "", header_columns[-1].text))
    row_elements = row_elements[1:]

    # We only care about the last and third-from-last columns.
    #
    # The last column is the number of tickets at this prize level.
    # The number of tickets has commas, like 1,350,500.
    # We'll have to parse them out.
    #
    # The third-from-last column is the prize value.
    # Prize value is usually "$" followed by a number.
    # Those are easy to parse.
    # But for the free ticket prize it's "FREE $1 TICKET"; stripping all
    # non-digits handles both.
    def parse_value(row_element):
        columns = row_element.find_all("td")
        try:
            return int(re.sub(r"\D", "", columns[-3].text))
        except Exception:
            # This is an exception we can handle.
            # We can simply return a value of 0 if the row doesn't have
            # what we expect. Our result might be inaccurate, but that is
            # acceptable. Log something useful so we know to look into it.
            logger.warning(
                "Exception parsing value for a row :%s", row_element.text
            )
            return 0

    def parse_num_tickets(row_element):
        columns = row_element.find_all("td")
        try:
            return int(columns[-1].text.replace(",", ""))
        except Exception:
            # Same as above, we can handle this.
            # Logging and returning 0 is better than blowing up.
            # (Not a bare "except:", so Ctrl-C and SystemExit still work.)
            logger.warning(
                "Exception parsing num_tickets for a row.\n%s",
                row_element.text,
            )
            return 0

    # Parse the prize value and the number of tickets of each prize tier.
    rows = [(parse_value(e), parse_num_tickets(e)) for e in row_elements]
    number_winning_tickets = sum(r[1] for r in rows)

    # Insert the losing ticket value, $0, and the number
    # of losing tickets into our rows.
    rows.insert(0, (0, total_number_tickets - number_winning_tickets))
    return rows


def find_price(html):
    """
    Return the ticket price, as an int, from a game rules page.

    Price is hard to find. It seems to always be near a tag which has the
    text "Price". So, we find that text, get the text of its parent's
    parent, take the last space-separated word of that text, which looks
    like "$10.", and strip it of the non-digits.
    """
    soup = bs(html, "lxml")
    price_element = soup.find(string="Price")
    price_text = price_element.parent.parent.text.split(" ")[-1]
    return int(re.sub(r"\D", "", price_text))


def calculate_original_ev(game_url):
    """
    Return the expected value of a game as it was originally printed.

    The "expected value" or "return on investment" of a game is the total
    value of all printed prizes divided by the total cost of all printed
    tickets.

    Imagine you bought every ticket that was printed. How much money would
    you spend? How much money would you get back in prizes? If you won
    $1,500,000 and spent $2,000,000 then your expected value is
    1,500,000 / 2,000,000 = 0.75. For every $1 spent on the game, you'll
    get back $0.75 for an average loss of $0.25.

    Fetches the game page and its game rules page. Raises
    ZeroDivisionError if the computed total ticket cost is zero.
    """
    game_html = fetch_html(game_url)
    game_rules_url = find_complete_game_rules_url(game_html)
    game_rules_html = fetch_html(game_rules_url)
    price = find_price(game_rules_html)
    rows = find_rows(game_rules_html)
    total_number_tickets = sum(count for _, count in rows)
    total_value_tickets = sum(count * value for value, count in rows)
    total_cost_tickets = total_number_tickets * price
    return total_value_tickets / total_cost_tickets


def combine_prizes(prizes):
    """
    Merge consecutive prize rows that share the same prize value.

    Each row is a sequence whose first element is a ticket count and whose
    last element is the prize value. Adjacent rows with an equal last
    element are collapsed into one row: the counts are summed and the
    other fields are taken from the first row of the run.

    Returns a new list of new lists; the rows passed in are not mutated.
    An empty input returns an empty list.
    """
    if not prizes:
        return []
    combined = []
    # Work on copies so summing counts never writes into the caller's rows.
    current = list(prizes[0])
    for prize in prizes[1:]:
        if current[-1] == prize[-1]:
            current[0] += prize[0]
        else:
            combined.append(current)
            current = list(prize)
    combined.append(current)
    return combined


def parse_game_html(name, url, html):
    """
    Build the game dict for one game from its game page html.

    Fetches the game's complete rules page (via `fetch_html`) to read the
    ticket price and the table of printed prizes, and reads the
    prizes-remaining table from the game page itself.

    Returns a dict with the keys "name", "url", "game_id", "price",
    "num_tx_initial", "state" and "prizes". "prizes" is a list of dicts
    with the keys "available", "claimed", "value" and "prize", ordered
    from the highest prize value to the lowest.

    Any parsing failure (missing element, unexpected text, division by
    zero when no remaining prizes are listed) propagates as an exception.
    """
    game = {}
    game_soup = bs(html, "lxml")
    game["name"] = name.strip()
    game["url"] = url
    # The game id is the run of digits at the very end of the url.
    game["game_id"] = re.match(r".*?(\d+$)", url).group(1)
    game_rules_url = find_complete_game_rules_url(html)
    game_rules_html = fetch_html(game_rules_url)
    game_rules_soup = bs(game_rules_html, "lxml")
    game["price"] = find_price(game_rules_html)
    prize_table = game_rules_soup.find("table", class_="miscr")

    def prize_value(p, price):
        # A "FREE ... TICKET" prize is worth the price of a ticket.
        text = p.text.strip()
        if re.search(r"FREE", text):
            return price
        return text.replace("$", "").replace(",", "")

    # One row per printed prize tier, skipping the header row:
    # [number of tickets, second-to-last column, prize value].
    # NOTE(review): the second-to-last column is presumably the
    # "odds, 1 in N" figure -- confirm against a live rules page.
    prize_tuples = [
        [
            int(tds[-1].text.replace(",", "").strip()),
            float(tds[-2].text.replace(",", "").strip()),
            float(prize_value(tds[-3], game["price"])),
        ]
        for tds in [tr.find_all("td") for tr in prize_table.find_all("tr")[1:]]
    ]
    # Ticket count of the last tier times its second-to-last column;
    # presumably winners * odds, i.e. the total tickets printed.
    game["num_tx_initial"] = prize_tuples[-1][0] * prize_tuples[-1][1]
    game["state"] = "pa"

    # Collapse adjacent tiers with the same value, lowest value first.
    combined_prizes = sorted(
        combine_prizes(deepcopy(prize_tuples)), key=lambda x: x[2]
    )

    # The game page lists [remaining count, prize value] for some tiers.
    # Thousands separators in the count are tolerated.
    prizes_remaining_table = game_soup.find(
        "table", class_="table-global"
    ).find("tbody")
    prizes_remaining = [
        [
            int(tds[1].text.replace(",", "").strip()),
            float(tds[0].text.replace("$", "").replace(",", "").strip()),
        ]
        for tds in [
            tr.find_all("td") for tr in prizes_remaining_table.find_all("tr")
        ]
    ]

    # Fraction of tickets remaining, estimated from the tiers we have real
    # numbers for: remaining counts divided by the printed counts of the
    # same number of highest-valued tiers (the tail of the ascending list,
    # walked backwards).
    percent_tx_remain = sum(p[0] for p in prizes_remaining) / sum(
        p[0] for p in combined_prizes[: -len(prizes_remaining) - 1 : -1]
    )

    # Reduce to [count, value], highest value first.
    combined_prizes = sorted(
        [[p[0], p[2]] for p in combined_prizes], key=lambda x: -x[1]
    )
    prizes = sorted(deepcopy(combined_prizes), key=lambda x: -x[1])
    # Use the published remaining counts for the top tiers and estimate the
    # rest by scaling the printed counts by the remaining fraction.
    prizes[: len(prizes_remaining)] = prizes_remaining
    for prize in prizes[len(prizes_remaining) :]:
        prize[0] = int(prize[0] * percent_tx_remain)

    game["prizes"] = [
        {
            "available": p[0],
            "claimed": orig[0] - p[0],
            "value": p[1],
            "prize": locale.currency(p[1], grouping=True),
        }
        for p, orig in zip(prizes, combined_prizes)
    ]
    return game


def main():
    """
    Scrape every active game and return a list of game dicts.

    Fetches the index page, then fetches and parses each game page.
    A game whose page can't be fetched or parsed is logged and skipped,
    so one bad page does not abort the whole run.
    """
    index_html = fetch_html(INDEX_URL)
    game_urls = find_game_urls(index_html)
    game_names = find_game_names(index_html)
    games = []
    for name, url in zip(game_names, game_urls):
        try:
            game_html = fetch_html(url)
        except Exception as e:
            logger.error("Error fetching %s: %s", url, e)
            continue
        try:
            games.append(parse_game_html(name, url, game_html))
        except Exception as e:
            # logger.exception logs at ERROR level and appends the
            # traceback of the exception being handled.
            logger.exception("Unable to parse game %s.\n%s", name, e)
    return games


if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))