From fdafaae2672a6587aa9b364becc54c29b47562e5 Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Fri, 7 Apr 2023 23:32:33 -0700
Subject: [PATCH] Add parser for Pennsylvania

---
 .gitignore                           |   2 +
 CHANGELOG.md                         |  12 +
 Makefile                             |  15 ++
 README.md                            | 101 +++++++
 TODO.md                              |  12 +
 lottery_data_scraper/__init__.py     |  40 +++
 lottery_data_scraper/pennsylvania.py | 385 +++++++++++++++++++++++++++
 lottery_data_scraper/schemas.py      |  76 ++++++
 setup.py                             |  28 ++
 tests/test_pennsylvania.py           |  23 ++
 10 files changed, 694 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CHANGELOG.md
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 TODO.md
 create mode 100644 lottery_data_scraper/__init__.py
 create mode 100644 lottery_data_scraper/pennsylvania.py
 create mode 100644 lottery_data_scraper/schemas.py
 create mode 100644 setup.py
 create mode 100644 tests/test_pennsylvania.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32aa08a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.egg-info/
+*.pyc
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..8f77428
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,12 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- Parser for Pennsylvania.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0e0e4e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+##
+# Lottery Data Scraper
+#
+# @file
+# @version 0.1
+
+FORCE:
+
+test: FORCE
+	python3 -m unittest discover tests
+
+style: FORCE
+	black .
+
+# end
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..dcd8bc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,101 @@
+# Parsing of lottery websites
+
+## Demo
+
+The following script should put you in a state where the last line makes a
+bunch of requests to the Pennsylvania lottery website, parses the tables of
+games/prizes, and prints a JSON structure of all of the games to your terminal.
+
+``` sh
+git clone https://github.com/owogawc/lottery_data_scraper
+cd lottery_data_scraper
+python3 -m venv ~/.virtualenvs/lottery_data_scraper
+. ~/.virtualenvs/lottery_data_scraper/bin/activate
+pip3 install -e .
+
+PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania
+```
+
+If you have [jq](https://stedolan.github.io/jq/) installed, you can get
+formatted output by piping the result to `jq` (and redirecting STDERR to /dev/null).
+
+``` sh
+PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania 2> /dev/null | jq
+```
+
+## Data models
+
+We're using [`marshmallow`](https://marshmallow.readthedocs.io/en/stable/index.html) to validate and serialize data.
+
+I'm including the schemas here just so you can quickly get a general idea of
+what data fields we're able to scrape from most lottery websites. What you see
+in this README might not be up-to-date with what's in
+[schemas.py](./lottery_data_scraper/schemas.py).
+
+As of 2023-04-07 the schemas are a work-in-progress. The remaining TODO is to
+determine and specify which fields are absolutely required and which are
+optional.
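+
+As a taste of what "required" will look like (a sketch, not the final
+schema; marshmallow's `required` keyword is the mechanism):
+
+``` python
+game_id = fields.Str(required=True)  # schema.validate reports this if absent
+price = fields.Number()              # optional
+```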
+
+### Game Schema
+
+``` python
+class GameSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    game_id = fields.Str(required=True)
+    name = fields.Str()
+    description = fields.Str()
+    image_urls = fields.Function(
+        lambda x: json.dumps(x.get("image_urls", [])),
+        deserialize=lambda x: json.loads(x),
+    )
+    how_to_play = fields.Str()
+    num_tx_initial = fields.Integer()
+    price = fields.Number()
+    prizes = fields.Nested(PrizeSchema, many=True)
+    state = fields.Str()
+    updated_at = fields.DateTime()
+    url = fields.Str()
+```
+
+### Prize Schema
+
+``` python
+class PrizeSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    game_id = fields.Integer()
+    available = fields.Integer()
+    claimed = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    value = fields.Number()
+    prize = fields.Str()
+```
+
+# Tests
+
+Testing is kind of tricky because you can't rely on _just_ Python with its
+`requests` library. Some states have scrape protections that require you to
+actually run JavaScript. Some states have extreme scrape protection that
+requires you to run an actual _display_. They check for a rendering context
+that doesn't exist when you run a headless browser in Selenium. To scrape
+those sites, you have to run an [X virtual
+framebuffer](https://en.wikipedia.org/wiki/Xvfb). Testing in these cases isn't
+as simple as running `python3 -m unittest discover`.
+
+# Contributing
+
+``` sh
+git clone https://github.com/owogawc/lottery_data_scraper
+cd lottery_data_scraper
+python3 -m venv ~/.virtualenvs/lottery_data_scraper
+. ~/.virtualenvs/lottery_data_scraper/bin/activate
+pip3 install -e .
+```
+
+Then you should be able to run `make test` and see the tests pass.
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..1d23a60
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,12 @@
+# TODO
+
+- [ ] Add more states.
+  - [ ] California
+  - [ ] Georgia
+  - [ ] Louisiana
+  - [ ] Florida
+  - [ ] Texas
+  - [ ] New Mexico
+- [ ] [Publish to PyPI](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
+- [ ] Do we still need Xvfb? Which states have that level of scrape protection?
+- [ ] Decide on and add a license.
diff --git a/lottery_data_scraper/__init__.py b/lottery_data_scraper/__init__.py
new file mode 100644
index 0000000..a0e1b91
--- /dev/null
+++ b/lottery_data_scraper/__init__.py
@@ -0,0 +1,40 @@
+"""
+Configure logging for the entire package.
+
+You can specify a log level with the environment variable
+PY_LOG_LVL=[debug|info|warning|error|critical]
+"""
+import logging
+import logging.config
+import os
+
+
+# Prefix the basic format with a timestamp, file pathname, and line number.
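+# A record will then look roughly like this (illustrative):
+#   2023-04-07 23:32:33,123 /path/to/pennsylvania.py 42 WARNING:root:message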
+# See: https://docs.python.org/3/library/logging.html#logrecord-attributes
+LOG_FORMAT = "%(asctime)s %(pathname)s %(lineno)s {}".format(logging.BASIC_FORMAT)
+
+log_level = getattr(logging, os.environ.get("PY_LOG_LVL", "WARNING").upper())
+logging_config = {
+    "version": 1,
+    "formatters": {
+        "standard": {
+            "format": LOG_FORMAT,
+        },
+    },
+    "handlers": {
+        "default": {
+            "level": log_level,
+            "formatter": "standard",
+            "class": "logging.StreamHandler",
+        },
+    },
+    "loggers": {
+        "": {
+            "handlers": ["default"],
+            "level": log_level,
+            "propagate": True,
+        },
+    },
+}
+
+logging.config.dictConfig(logging_config)
diff --git a/lottery_data_scraper/pennsylvania.py b/lottery_data_scraper/pennsylvania.py
new file mode 100644
index 0000000..13ee267
--- /dev/null
+++ b/lottery_data_scraper/pennsylvania.py
@@ -0,0 +1,385 @@
+"""
+Scrapes the Pennsylvania lottery website for scratch-off ticket
+data and calculates the expected value for each game.
+
+Pennsylvania publishes the number of tickets printed and how many
+tickets are printed at each prize level.
+
+We can calculate the expected value of a game by summing
+the value of all the prizes and dividing that by the cost
+of all the tickets.
+
+The palottery website has an "index" page that has links to every game.
+Each individual game has a link to a "game rules" page.
+We can start at the index and visit every game rules page, then we
+can find the html table on that page which has the detailed prize
+information and run our calculations.
+
+Website that we'll be scraping:
+https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx
+
+Example usage:
+    python -m lottery_data_scraper.pennsylvania
+Or:
+    PY_LOG_LVL=DEBUG USE_CACHE=true python -m lottery_data_scraper.pennsylvania
+
+The following behavior is configurable through shell environment variables.
+
+Set PY_LOG_LVL to print useful debug info to the console.
+PY_LOG_LVL=[DEBUG|INFO|WARNING|ERROR|CRITICAL]
+Defaults to WARNING.
+
+Set USE_CACHE to cache responses. This speeds up development
+and is nice to the servers we're hitting.
+USE_CACHE=[true]
+Defaults to unset (no caching). Note: setting this variable to the
+string "False" still enables the cache, because any non-empty string
+is truthy. Either set it to true or leave it unset.
+"""
+import base64
+import sys
+import traceback
+from copy import deepcopy
+import locale
+import logging
+import os
+import re
+from tempfile import gettempdir
+
+from bs4 import BeautifulSoup as bs
+import requests
+
+from lottery_data_scraper.schemas import GameSchema
+
+logger = logging.getLogger(__name__)
+locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
+
+# Values that are used in many places throughout a script are
+# worth assigning to constants.
+BASE_URL = "https://www.palottery.state.pa.us"
+INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"
+
+
+def fetch_html(url):
+    """
+    Helper to fetch and cache html responses.
+
+    During development and while testing, we'll be hitting the same urls often.
+    The content of the pages probably won't be changing.
+    Caching the results will speed up development,
+    and the servers will appreciate us for not spamming requests.
+
+    The responses are cached in the operating system's tempfile directory.
+    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
+    The filename is based on the URL. But since the URL might contain
+    characters that are invalid for filenames, we base64 encode the URL.
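+
+    For example (illustrative), the cache filename for "https://example.com":
+
+        >>> base64.urlsafe_b64encode(b"https://example.com").decode("utf-8")
+        'aHR0cHM6Ly9leGFtcGxlLmNvbQ=='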
+    """
+    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
+    filepath = os.path.join(gettempdir(), safe_filename)
+
+    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
+        with open(filepath, "r") as f:
+            return f.read()
+    else:
+        # We are relying on the outside world when we make a request, so we
+        # might want to wrap this in a try/except. But we'd only want to
+        # do that in two cases:
+        #
+        # 1. We have a way of handling exceptions. A good example would be
+        #    to catch exceptions and retry the request; maybe the network
+        #    was down.
+        #
+        # 2. We can't handle the exception, but we want to log something
+        #    more useful than the stack trace that will get spit out if
+        #    we just let the exception go uncaught.
+        #
+        # In this case, I don't think it's worth muddying up the code
+        # trying to handle exceptions here. It's easy enough to just re-run
+        # the script.
+        html = requests.get(url).text
+        if os.environ.get("USE_CACHE", False):
+            with open(filepath, "w+") as f:
+                f.write(html)
+        return html
+
+
+def find_game_names(html):
+    """
+    Game names can be found on the index page
+    in the text of anchor elements
+    which have the class "activeGame_li".
+    """
+    soup = bs(html, "lxml")
+    game_elements = soup.find_all("a", class_="activeGame_li")
+    return [
+        re.sub(r"\s+", " ", g.find("div", class_="info").text) for g in game_elements
+    ]
+
+
+def find_game_urls(html):
+    """
+    Luckily, all of the Pennsylvania games are listed on a single html page.
+    We don't have to mess around with pagination and making multiple requests.
+
+    The links are "href" attributes of anchor tags with the class "activeGame_li".
+    """
+    soup = bs(html, "lxml")
+    game_elements = soup.find_all("a", class_="activeGame_li")
+    return ["{}{}".format(BASE_URL, e.attrs["href"]) for e in game_elements]
+
+
+def find_complete_game_rules_url(html):
+    """
+    Game pages have a link to the complete game rules.
+    The complete game rules have a table of all prizes for a game.
+
+    The link to the game rules page is in an anchor tag
+    nested under a div with the class "instant-games-games-info".
+    """
+    soup = bs(html, "lxml")
+    games_info_div = soup.find("div", class_="instant-games-games-info")
+    games_info_anchor = games_info_div.find_all("a")[1]
+    games_info_url = games_info_anchor.attrs["href"]
+    return games_info_url
+
+
+def find_rows(html):
+    """
+    From a game rules page, find the rows of the table
+    that have the number of tickets and the value of each prize.
+    """
+    soup = bs(html, "lxml")
+
+    # Some game rules pages have multiple tables.
+    # The first table has the prizes.
+    # soup.find returns the first matching element;
+    # soup.find_all returns a list of all matching elements.
+    prize_table = soup.find("table")
+    row_elements = prize_table.find_all("tr")
+
+    # The first row is headers. We want to skip it for the
+    # calculations, but it holds an important bit of information:
+    # the total number of tickets printed.
+    #
+    # The remaining rows only contain winning-ticket info, and we
+    # also care about the losing prize tier. It has a value of "0",
+    # but we want to know how many losing tickets there are.
+    #
+    # We can calculate that from the first header: subtract the sum
+    # of the winning tickets from the total number of tickets printed,
+    # giving us the number of losing tickets.
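+    #
+    # For example (illustrative numbers): if the header says 10,000,000
+    # total tickets printed and the winning rows sum to 3,000,000 tickets,
+    # we'll insert a losing row of (0, 7000000) below.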
+    header_row = row_elements[0]
+    header_columns = header_row.find_all("th")
+    total_number_tickets = int(re.sub(r"\D", "", header_columns[-1].text))
+
+    row_elements = row_elements[1:]
+
+    # We only care about the last and third-to-last columns.
+    # The following helper functions parse the data we care
+    # about from each row.
+    #
+    # The last column is the number of tickets at this prize level.
+    # The number of tickets has commas, like 1,350,500.
+    # We'll have to parse them out.
+    #
+    # The third-to-last column is the prize value. (The second-to-last
+    # holds the odds, which we don't need here.)
+    # Prize value is usually "$" followed by a number.
+    # Those are easy to parse.
+    # But for the free ticket prize it's "FREE $1 TICKET".
+    def parse_value(row_element):
+        columns = row_element.find_all("td")
+        try:
+            value_element = columns[-3]
+            value_text = value_element.text
+            return int(re.sub(r"\D", "", value_text))
+        except Exception:
+            # This is an exception we can handle.
+            # We can simply return a value of 0 if
+            # the row doesn't have what we expect.
+            # Our result might be inaccurate, but
+            # I'll consider that acceptable.
+            # I'll log something useful so I know
+            # to look into it.
+            logger.warning("Exception parsing value for a row: %s", row_element.text)
+            return 0
+
+    def parse_num_tickets(row_element):
+        columns = row_element.find_all("td")
+        try:
+            num_tickets_element = columns[-1]
+            num_tickets_text = num_tickets_element.text
+            return int(num_tickets_text.replace(",", ""))
+        except Exception:
+            # Same as above, we can handle this.
+            # Logging and returning 0 is better than blowing up.
+            logger.warning(
+                "Exception parsing num_tickets for a row.\n{}".format(row_element.text)
+            )
+            return 0
+
+    # Iterate over each row and parse out the value of the prize tier
+    # and the number of remaining tickets at that prize tier.
+    rows = [(parse_value(e), parse_num_tickets(e)) for e in row_elements]
+    number_winning_tickets = sum(r[1] for r in rows)
+
+    # Insert the losing ticket value, $0, and the number
+    # of losing tickets into our rows.
+    rows.insert(0, (0, total_number_tickets - number_winning_tickets))
+    return rows
+
+
+def find_price(html):
+    """
+    Price is hard to find. It seems to always be the last word of the
+    text surrounding the element whose text is "Price". So we can find
+    that "Price" string, get the text of its grandparent, and take the
+    last word of that text. That will be the price of the ticket as a
+    string that looks like "$10.", which we can then strip of the
+    non-digits.
+    """
+    soup = bs(html, "lxml")
+    price_element = soup.find(string="Price")
+    price_text = price_element.parent.parent.text.split(" ")[-1]
+    price = int(re.sub(r"\D", "", price_text))
+    return price
+
+
+def calculate_original_ev(game_url):
+    """
+    The "expected value" or "return on investment" of a game
+    is the total value of all the prizes
+    divided by the total cost of all the tickets.
+
+    Imagine you bought every ticket that was printed.
+
+    How much money would you spend? How much money would you get back in prizes?
+
+    If you won $1,500,000 and spent $2,000,000,
+    then your expected value is 1,500,000 / 2,000,000 = 0.75.
+
+    For every $1 spent on the game, you'll get back $0.75,
+    for an average loss of $0.25.
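+
+    In terms of this module's (value, num_tickets) rows, that's (a sketch
+    of the computation done below):
+
+        ev = sum(v * n for v, n in rows) / (sum(n for _, n in rows) * price)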
+    """
+    game_html = fetch_html(game_url)
+    game_rules_url = find_complete_game_rules_url(game_html)
+    game_rules_html = fetch_html(game_rules_url)
+    price = find_price(game_rules_html)
+    rows = find_rows(game_rules_html)
+    total_number_tickets = sum(r[1] for r in rows)
+    total_value_tickets = sum(r[1] * r[0] for r in rows)
+    total_cost_tickets = total_number_tickets * price
+    ev = total_value_tickets / total_cost_tickets
+    return ev
+
+
+def combine_prizes(prizes):
+    # Merge adjacent prize rows that share the same prize value,
+    # summing their ticket counts. Rows are [num_tickets, odds, value].
+    combined = []
+    last_prize = prizes[0]
+    for prize in prizes[1:]:
+        if last_prize[-1] == prize[-1]:
+            last_prize[0] += prize[0]
+        else:
+            combined.append(last_prize)
+            last_prize = prize
+    combined.append(last_prize)
+    return combined
+
+
+def parse_game_html(name, url, html):
+    game = {}
+    game_soup = bs(html, "lxml")
+    game["name"] = name.strip()
+    game["url"] = url
+    game["game_id"] = re.match(r".*?(\d+$)", url).group(1)
+    game_rules_url = find_complete_game_rules_url(html)
+    game_rules_html = fetch_html(game_rules_url)
+    game_rules_soup = bs(game_rules_html, "lxml")
+    game["price"] = find_price(game_rules_html)
+    prize_table = game_rules_soup.find("table", class_="miscr")
+
+    def prize_value(p, price):
+        p = p.text.strip()
+        if re.search(r"FREE", p):
+            return price
+        else:
+            return p.replace("$", "").replace(",", "")
+
+    # Each prize row becomes [num_tickets, odds, value].
+    prize_tuples = [
+        [
+            int(tds[-1].text.replace(",", "").strip()),
+            float(tds[-2].text.replace(",", "").strip()),
+            float(prize_value(tds[-3], game["price"])),
+        ]
+        for tds in [tr.find_all("td") for tr in prize_table.find_all("tr")[1:]]
+    ]
+    game["num_tx_initial"] = prize_tuples[-1][0] * prize_tuples[-1][1]
+    game["state"] = "pa"
+    combined_prizes = sorted(combine_prizes(deepcopy(prize_tuples)), key=lambda x: x[2])
+    prizes_remaining_table = game_soup.find("table", class_="table-global").find(
+        "tbody"
+    )
+    prizes_remaining = [
+        [
+            int(tds[1].text.strip()),
+            float(tds[0].text.replace("$", "").replace(",", "").strip()),
+        ]
+        for tds in [tr.find_all("td") for tr in prizes_remaining_table.find_all("tr")]
+    ]
+    # The game page only reports remaining counts for the top prize tiers.
+    # Use those counts directly, and estimate the rest by scaling original
+    # counts by the fraction of those top tiers' tickets still remaining.
+    percent_tx_remain = sum(p[0] for p in prizes_remaining) / sum(
+        p[0] for p in combined_prizes[: -len(prizes_remaining) - 1 : -1]
+    )
+    combined_prizes = sorted(
+        [[p[0], p[2]] for p in combined_prizes], key=lambda x: -x[1]
+    )
+    prizes = sorted(deepcopy(combined_prizes), key=lambda x: -x[1])
+    prizes[: len(prizes_remaining)] = prizes_remaining
+    for prize in prizes[len(prizes_remaining) :]:
+        prize[0] = int(prize[0] * percent_tx_remain)
+    game_prizes = []
+    for p, orig in zip(prizes, combined_prizes):
+        prize = {}
+        prize["available"] = p[0]
+        prize["claimed"] = orig[0] - p[0]
+        prize["value"] = p[1]
+        prize["prize"] = locale.currency(p[1], grouping=True)
+        game_prizes.append(prize)
+    game["prizes"] = game_prizes
+    return game
+
+
+def main():
+    index_html = fetch_html(INDEX_URL)
+    game_urls = find_game_urls(index_html)
+    game_names = find_game_names(index_html)
+    # For each game, fetch its page and parse it into a game dict.
+    # Parsing involves a second fetch: each game page links to a
+    # complete game rules page that holds the full prize table.
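+    #
+    # Each parsed game is a dict shaped roughly like (illustrative):
+    #   {"name": "...", "game_id": "3201", "price": 30, "prizes": [...], ...}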
+    games = []
+
+    for name, url in list(zip(game_names, game_urls)):
+        try:
+            game_html = fetch_html(url)
+        except Exception as e:
+            logger.error("Error fetching %s: %s", url, e)
+            continue
+        try:
+            games.append(parse_game_html(name, url, game_html))
+        except Exception as e:
+            _, _, tb = sys.exc_info()
+            tb_msg = "\n".join(traceback.format_tb(tb))
+            logger.error("Unable to parse game {}.\n{}\n{}".format(name, e, tb_msg))
+
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
diff --git a/lottery_data_scraper/schemas.py b/lottery_data_scraper/schemas.py
new file mode 100644
index 0000000..dd4b6d6
--- /dev/null
+++ b/lottery_data_scraper/schemas.py
@@ -0,0 +1,76 @@
+"""Some marshmallow schemas to do data validation and serialization.
+
+How to use:
+
+Create your model as a plain old Python object.
+
+Example:
+
+    game = {}
+    game["game_id"] = "5"
+    game["price"] = 30
+    game["state"] = "tx"
+
+Then create an instance of the schema.
+
+    schema = GameSchema()
+
+Call `schema.dumps(game)` to "dump" your Python object to a string in JSON
+format.
+
+    >>> game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
+    >>> schema = GameSchema()
+    >>> schema.dumps(game)
+    '{"game_id": "5", "state": "tx", "created_at": "2023-04-08T05:58:49.494561", "price": 30.0, "image_urls": "[]"}'
+
+And you can load a JSON string into a Python object with `schema.loads`.
+
+    >>> schema.loads(schema.dumps(game))
+    {'game_id': '5', 'state': 'tx', 'created_at': datetime.datetime(2023, 4, 8, 5, 58, 49, 494561), 'price': 30.0, 'image_urls': []}
+
+Some fields, like `game_id`, are required. You can validate a Python object by calling `schema.validate`.
+
+    >>> game = {"price": 30, "state": "tx", "created_at": datetime.utcnow()}
+    >>> schema.dumps(game)
+    '{"state": "tx", "created_at": "2023-04-08T06:02:32.126541", "price": 30.0, "image_urls": "[]"}'
+    >>> schema.validate(game)
+    {'game_id': ['Missing data for required field.'], 'created_at': ['Not a valid datetime.']}
+"""
+from datetime import datetime
+import json
+from marshmallow import Schema, fields
+
+
+class PrizeSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    game_id = fields.Integer()
+    available = fields.Integer()
+    claimed = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    value = fields.Number()
+    prize = fields.Str()
+
+
+class GameSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    game_id = fields.Str(required=True)
+    name = fields.Str()
+    description = fields.Str()
+    image_urls = fields.Function(
+        lambda x: json.dumps(x.get("image_urls", [])),
+        deserialize=lambda x: json.loads(x),
+    )
+    how_to_play = fields.Str()
+    num_tx_initial = fields.Integer()
+    price = fields.Number()
+    prizes = fields.Nested(PrizeSchema, many=True)
+    state = fields.Str()
+    updated_at = fields.DateTime()
+    url = fields.Str()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..3d1db5f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="lottery_data_scraper",
+    version="0.0.1",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    url="https://github.com/owogac/lottery_data_scraper",
+    packages=find_packages(),
+    install_requires=[
+        "beautifulsoup4",
+        "requests==2.28.2",
+        "urllib3==1.26.15",
+        "numpy",
+        "pandas",
+        "lxml",
+        "html2text",
+        "html5lib",
+        "marshmallow==3.19.0",
+        "selenium==3.141.0",
+        "pybind11",
+        # If you want to develop locally and don't want to mess around with
+        # Xvfb (https://en.wikipedia.org/wiki/Xvfb), then just comment out
+        # the next line before you run `python3 setup.py install`.
+        "xvfbwrapper==0.2.9",
+        "table_ocr==0.2.5",
+    ],
+)
diff --git a/tests/test_pennsylvania.py b/tests/test_pennsylvania.py
new file mode 100644
index 0000000..7737299
--- /dev/null
+++ b/tests/test_pennsylvania.py
@@ -0,0 +1,23 @@
+import unittest
+
+from lottery_data_scraper import pennsylvania
+
+
+class TestPennsylvania(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201"
+        html = pennsylvania.fetch_html(url)
+        game = pennsylvania.parse_game_html("$3 Million Mega Stacks", url, html)
+        self.assertEqual(game["name"], "$3 Million Mega Stacks")
+        self.assertEqual(game["price"], 30)
+        self.assertEqual(
+            game["url"],
+            "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201",
+        )
+        self.assertEqual(game["game_id"], "3201")
+        self.assertEqual(game["prizes"][0]["prize"], "$3,000,000.00")
+        # Perhaps unfortunately in dollars. Cents would be better, eh?
+        self.assertEqual(game["prizes"][0]["value"], 3000000)
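+
+    def test_combine_prizes(self):
+        # Network-free sanity check with illustrative numbers.
+        # combine_prizes merges adjacent rows that share a prize value,
+        # summing ticket counts. Rows are [num_tickets, odds, value].
+        prizes = [[10, 1.5, 100.0], [5, 3.0, 100.0], [2, 8.0, 500.0]]
+        self.assertEqual(
+            pennsylvania.combine_prizes(prizes),
+            [[15, 1.5, 100.0], [2, 8.0, 500.0]],
+        )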