Add parser for Pennsylvania
commit fdafaae267

.gitignore
@@ -0,0 +1,2 @@
*.egg-info/
*.pyc

CHANGELOG.md
@@ -0,0 +1,12 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Parser for Pennsylvania.

Makefile
@@ -0,0 +1,15 @@
##
# Lottery Data Scraper
#
# @file
# @version 0.1

FORCE:

test: FORCE
	python3 -m unittest discover tests

style: FORCE
	black .

# end

README.md
@@ -0,0 +1,101 @@
# Parsing of lottery websites

## Demo

The following script should put you in a state where the last line will make a
bunch of requests to the Pennsylvania lottery website, parse the tables of
games/prizes, and print to your terminal a JSON structure of all of the games.

``` sh
git clone https://github.com/owogawc/lottery_data_scraper
cd lottery_data_scraper
python3 -m venv ~/.virtualenvs/lottery_data_scraper
. ~/.virtualenvs/lottery_data_scraper/bin/activate
pip3 install -e .

PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania
```

If you have [jq](https://stedolan.github.io/jq/) installed, you can get some
formatted output by piping it to `jq` (and redirecting STDERR to /dev/null).

``` sh
PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania 2> /dev/null | jq
```

## Data models

We're using [`marshmallow`](https://marshmallow.readthedocs.io/en/stable/index.html) to validate and serialize data.

I'm including the schemas here just so you can quickly get a general idea of
what data fields we're able to scrape from most lottery websites. What you see
in this README might not be up-to-date with what's in
[schemas.py](./lottery_data_scraper/schemas.py).

As of 2023-04-07 the schemas are a work-in-progress. The remaining TODO is to
determine and specify which fields are absolutely required and which are
optional.

### Game Schema

``` python
class GameSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    game_id = fields.Str()
    name = fields.Str()
    description = fields.Str()
    image_urls = fields.Function(
        lambda x: json.loads(x.image_urls) if x.image_urls else [],
        deserialize=lambda x: None if x.image_urls == [] else json.dumps(x.image_urls),
    )
    how_to_play = fields.Str()
    num_tx_initial = fields.Integer()
    price = fields.Number()
    prizes = fields.Nested(PrizeSchema, many=True)
    state = fields.Str()
    updated_at = fields.DateTime()
    url = fields.Str()
```

### Prize Schema

``` python
class PrizeSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    game_id = fields.Integer()
    available = fields.Integer()
    claimed = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    value = fields.Number()
    prize = fields.Str()
```
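
For a quick feel of how these schemas behave, here's the round-trip from the
docstring in [schemas.py](./lottery_data_scraper/schemas.py): `dumps` gives
you a JSON string, and `loads` parses it back into a Python dict.

``` python
from datetime import datetime

from lottery_data_scraper.schemas import GameSchema

game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
schema = GameSchema()
s = schema.dumps(game)  # JSON string, e.g. '{"game_id": "5", "state": "tx", ...}'
game_again = schema.loads(s)  # back to a dict with a real datetime
```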

# Tests

Testing is kind of tricky because you can't rely on _just_ Python with its
`requests` library. Some states have scrape protections that require you to
actually run JavaScript. Some states have extreme scrape protection that
requires you to actually run a _display_. They check for some rendering
context that doesn't exist when you run a headless browser in Selenium. To
scrape those sites, you actually have to run an [X virtual
framebuffer](https://en.wikipedia.org/wiki/Xvfb). Testing in these cases isn't
as simple as running `python3 -m unittest discover`.
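
For the display-checking sites, the general shape is to wrap the Selenium
session in a virtual framebuffer. Here's a minimal sketch, assuming
`xvfbwrapper` and `selenium` (both pinned in setup.py), a local
Firefox/geckodriver install, and the Xvfb binary itself (e.g. `apt install
xvfb`); the URL is a placeholder:

``` python
from selenium import webdriver
from xvfbwrapper import Xvfb

# Xvfb() starts a throwaway virtual X display, so the browser below renders
# "for real" as far as the site's protection can tell.
with Xvfb():
    driver = webdriver.Firefox()
    try:
        driver.get("https://example.com/heavily-protected-lottery-page")
        html = driver.page_source
    finally:
        driver.quit()
```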

# Contributing

``` sh
git clone https://github.com/owogawc/lottery_data_scraper
cd lottery_data_scraper
python3 -m venv ~/.virtualenvs/lottery_data_scraper
. ~/.virtualenvs/lottery_data_scraper/bin/activate
pip3 install -e .
```

Then you should be able to run `make test` and see the tests pass.

TODO.md
@@ -0,0 +1,12 @@
# TODO

- [ ] Add more states.
  - [ ] California
  - [ ] Georgia
  - [ ] Louisiana
  - [ ] Florida
  - [ ] Texas
  - [ ] New Mexico
- [ ] [Publish to PyPI](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
- [ ] Do we still need Xvfb? Which states have that level of scrape protection?
- [ ] Decide on and add a license.

lottery_data_scraper/__init__.py
@@ -0,0 +1,40 @@
"""
Configure logging for the entire package.

You can specify a log level with the environment variable
PY_LOG_LVL=[debug|info|warning|error|critical]
"""
import logging
import logging.config
import os


# Prefix the basic format with a timestamp, file pathname, and line number.
# See: https://docs.python.org/3/library/logging.html#logrecord-attributes
LOG_FORMAT = "%(asctime)s %(pathname)s %(lineno)s {}".format(logging.BASIC_FORMAT)

log_level = getattr(logging, os.environ.get("PY_LOG_LVL", "WARNING").upper())
logging_config = {
    "version": 1,
    "formatters": {
        "standard": {
            "format": LOG_FORMAT,
        },
    },
    "handlers": {
        "default": {
            "level": log_level,
            "formatter": "standard",
            "class": "logging.StreamHandler",
        },
    },
    "loggers": {
        "": {
            "handlers": ["default"],
            "level": log_level,
            "propagate": True,
        },
    },
}

logging.config.dictConfig(logging_config)
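
# Modules in this package get configured loggers through the root ("") logger
# above, e.g.:
#
#     logger = logging.getLogger(__name__)
#     logger.debug("Only shown when PY_LOG_LVL=debug.")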

lottery_data_scraper/pennsylvania.py
@@ -0,0 +1,385 @@
"""
Scrapes the Pennsylvania lottery website for scratch-off ticket
data and calculates the expected value for each game.

Pennsylvania publishes the number of tickets printed and how many
tickets are printed at each prize level.

We can calculate the expected value of a game by summing
the value of all the prizes and dividing that by the cost
of all the tickets.

The palottery website has an "index" page that has links to every game.
Each individual game has a link to a "game rules" page.
We can start at the index and visit every game rules page, then we
can find the html table on that page which has the detailed prize
information and run our calculations.

Website that we'll be scraping:
https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx

Example usage:
    python3 -m lottery_data_scraper.pennsylvania
Or:
    PY_LOG_LVL=DEBUG USE_CACHE=True python3 -m lottery_data_scraper.pennsylvania

The following behavior is configurable through shell environment variables.

Set PY_LOG_LVL to print useful debug info to console.
    PY_LOG_LVL=[DEBUG|INFO|WARNING|ERROR|CRITICAL]
    Defaults to WARNING.

Set USE_CACHE to cache responses. This speeds up development
and is nice to the servers we're hitting.
    USE_CACHE=[True]
    Defaults to False. Note: setting this env variable to the string "False"
    will still use the cache, because the non-empty string "False" is truthy.
    Either set it to True or don't set it at all.
"""
import base64
import locale
import logging
import os
import re
import sys
import traceback
from copy import deepcopy
from tempfile import gettempdir

import requests
from bs4 import BeautifulSoup as bs

from lottery_data_scraper.schemas import GameSchema

logger = logging.getLogger(__name__)
locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")

# It's worth assigning to constants values that are used in many
# places throughout a script.
BASE_URL = "https://www.palottery.state.pa.us"
INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"


def fetch_html(url):
    """
    Helper to fetch and cache html responses.

    During development and while testing, we'll be hitting the same urls often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.

    The responses are cached in the operating system's tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)

    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling exceptions.
        #    A good example would be to catch exceptions and retry the
        #    request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        #    more useful than the stack trace that will get spit out if
        #    we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html
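

# Illustration of the cache-filename scheme above; value checked by hand:
#   base64.urlsafe_b64encode(b"https://example.com/a?b=1").decode("utf-8")
#   == "aHR0cHM6Ly9leGFtcGxlLmNvbS9hP2I9MQ=="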


def find_game_names(html):
    """
    Game names can be found on the index page
    in the text of anchor elements
    which have the class "activeGame_li".
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return [
        re.sub(r"\s+", " ", g.find("div", class_="info").text) for g in game_elements
    ]


def find_game_urls(html):
    """
    Luckily, all of the Pennsylvania games are listed on a single html page.
    We don't have to mess around with any pagination and making multiple requests.

    The links are "href" attributes of anchor tags with the class "activeGame_li".
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return ["{}{}".format(BASE_URL, e.attrs["href"]) for e in game_elements]


def find_complete_game_rules_url(html):
    """
    Game pages have a link to the complete game rules.
    The complete game rules have a table of all prizes for a game.

    The link to the game rules page is in an anchor tag
    nested under a div with the class "instant-games-games-info".
    """
    soup = bs(html, "lxml")
    games_info_div = soup.find("div", class_="instant-games-games-info")
    games_info_anchor = games_info_div.find_all("a")[1]
    games_info_url = games_info_anchor.attrs["href"]
    return games_info_url


def find_rows(html):
    """
    From a game rules page, find the rows of the table
    that have the number of tickets and the value of each prize.
    """
    soup = bs(html, "lxml")

    # Some game rules pages have multiple tables.
    # The first table has the prizes.
    # soup.find returns the first matching element.
    # soup.find_all returns a list of all matching elements.
    prize_table = soup.find("table")
    row_elements = prize_table.find_all("tr")

    # The first row is headers, so we want to skip it for the
    # calculations, but it includes an important bit of information
    # that we want. The body rows only contain winning ticket info,
    # and we also care about a row for the losing prize tier.
    # It will have a value of "0", but we want to know
    # how many losing tickets there are.
    #
    # We can calculate that from the first header. It
    # contains the total number of tickets printed.
    # Let's get the total number of tickets printed so
    # we can subtract the sum of the number of winning tickets,
    # giving us the number of losing tickets.
    header_row = row_elements[0]
    header_columns = header_row.find_all("th")
    total_number_tickets = int(re.sub(r"\D", "", header_columns[-1].text))

    row_elements = row_elements[1:]

    # We only care about the last and third-to-last columns.
    # The following helper functions will help us parse
    # the data we care about from each row.
    #
    # The last column is the number of tickets at this prize level.
    # The number of tickets has commas, like 1,350,500.
    # We'll have to parse them out.
    #
    # The third-to-last column is the prize value.
    # Prize value is usually "$" followed by a number.
    # Those are easy to parse.
    # But for the free ticket prize it's "FREE $1 TICKET".
    def parse_value(row_element):
        columns = row_element.find_all("td")
        try:
            value_element = columns[-3]
            value_text = value_element.text
            return int(re.sub(r"\D", "", value_text))
        except Exception:
            # This is an exception we can handle.
            # We can simply return a value of 0 if
            # the row doesn't have what we expect.
            # Our result might be inaccurate, but
            # I'll consider that acceptable.
            # I'll log something useful so I know
            # to look into it.
            logger.warning("Exception parsing value for a row: %s", row_element.text)
            return 0

    def parse_num_tickets(row_element):
        columns = row_element.find_all("td")
        try:
            num_tickets_element = columns[-1]
            num_tickets_text = num_tickets_element.text
            return int(num_tickets_text.replace(",", ""))
        except Exception:
            # Same as above, we can handle this.
            # Logging and returning 0 is better than blowing up.
            logger.warning(
                "Exception parsing num_tickets for a row.\n%s", row_element.text
            )
            return 0

    # Iterate over each row and parse out the value of the prize tier
    # and the number of remaining tickets at that prize tier.
    rows = [(parse_value(e), parse_num_tickets(e)) for e in row_elements]
    number_winning_tickets = sum(r[1] for r in rows)

    # Insert the losing ticket value, $0, and the number
    # of losing tickets into our rows.
    rows.insert(0, (0, total_number_tickets - number_winning_tickets))
    return rows


def find_price(html):
    """
    Price is hard to find. It seems to always be a sibling to an
    <i> tag which has the text "Price". So, we can find that <i>
    tag, get the text of its parent, find the last word of that text,
    and that will be the price of the ticket as a string that looks like
    "$10.", which we can then strip of the non-digits.
    """
    soup = bs(html, "lxml")
    price_element = soup.find(string="Price")
    price_text = price_element.parent.parent.text.split(" ")[-1]
    price = int(re.sub(r"\D", "", price_text))
    return price


def calculate_original_ev(game_url):
    """
    The "expected value" or "return on investment" of a game
    is the total value of all of the prizes
    divided by the total cost of all of the tickets.

    Imagine you bought every ticket that was printed.

    How much money would you spend? How much money would you get back in prizes?

    If you won $1,500,000 and spent $2,000,000,
    then your expected value is 1,500,000 / 2,000,000 = 0.75.

    For every $1 spent on the game, you'll get back $0.75,
    for an average loss of $0.25.
    """
    game_html = fetch_html(game_url)
    game_rules_url = find_complete_game_rules_url(game_html)
    game_rules_html = fetch_html(game_rules_url)
    price = find_price(game_rules_html)
    rows = find_rows(game_rules_html)
    total_number_tickets = sum(r[1] for r in rows)
    total_value_tickets = sum(r[1] * r[0] for r in rows)
    total_cost_tickets = total_number_tickets * price
    ev = total_value_tickets / total_cost_tickets
    return ev
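

# Illustrative usage (id 3201 is the game used in the tests; the result is a
# ratio, e.g. roughly 0.75 would mean $0.75 back per $1 spent):
#   calculate_original_ev(f"{BASE_URL}/Scratch-Offs/View-Scratch-Off.aspx?id=3201")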


def combine_prizes(prizes):
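    """
    Merge adjacent prize tiers that share the same prize value, summing
    their ticket counts. Rows are [num_tickets, odds, value] lists as built
    in parse_game_html; only adjacent equal-value rows are merged.
    """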
    combined = []
    last_prize = prizes[0]
    for prize in prizes[1:]:
        if last_prize[-1] == prize[-1]:
            last_prize[0] += prize[0]
        else:
            combined.append(last_prize)
            last_prize = prize
    combined.append(last_prize)
    return combined


def parse_game_html(name, url, html):
    game = {}
    game_soup = bs(html, "lxml")
    game["name"] = name.strip()
    game["url"] = url
    game["game_id"] = re.match(r".*?(\d+$)", url).group(1)
    game_rules_url = find_complete_game_rules_url(html)
    game_rules_html = fetch_html(game_rules_url)
    game_rules_soup = bs(game_rules_html, "lxml")
    game["price"] = find_price(game_rules_html)
    prize_table = game_rules_soup.find("table", class_="miscr")

    def prize_value(p, price):
        # A "FREE $X TICKET" prize is worth the price of a ticket.
        p = p.text.strip()
        if re.search(r"FREE", p):
            return price
        else:
            return p.replace("$", "").replace(",", "")

    # Each row becomes [num_tickets, odds, prize value].
    prize_tuples = [
        [
            int(tds[-1].text.replace(",", "").strip()),
            float(tds[-2].text.replace(",", "").strip()),
            float(prize_value(tds[-3], game["price"])),
        ]
        for tds in [tr.find_all("td") for tr in prize_table.find_all("tr")[1:]]
    ]
    # A tier's ticket count times its odds recovers the total number of
    # tickets printed.
    game["num_tx_initial"] = prize_tuples[-1][0] * prize_tuples[-1][1]
    game["state"] = "pa"
    combined_prizes = sorted(combine_prizes(deepcopy(prize_tuples)), key=lambda x: x[2])
    prizes_remaining_table = game_soup.find("table", class_="table-global").find(
        "tbody"
    )
    prizes_remaining = [
        [
            int(tds[1].text.strip()),
            float(tds[0].text.replace("$", "").replace(",", "").strip()),
        ]
        for tds in [tr.find_all("td") for tr in prizes_remaining_table.find_all("tr")]
    ]
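    # The game page only reports "prizes remaining" for the top prize tiers.
    # Comparing those remaining counts against the same tiers' original
    # counts estimates the fraction of tickets still in circulation; the
    # unreported lower tiers are scaled by that fraction below.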
    percent_tx_remain = sum(p[0] for p in prizes_remaining) / sum(
        p[0] for p in combined_prizes[: -len(prizes_remaining) - 1 : -1]
    )
    combined_prizes = sorted(
        [[p[0], p[2]] for p in combined_prizes], key=lambda x: -x[1]
    )
    prizes = sorted(deepcopy(combined_prizes), key=lambda x: -x[1])
    prizes[: len(prizes_remaining)] = prizes_remaining
    for prize in prizes[len(prizes_remaining) :]:
        prize[0] = int(prize[0] * percent_tx_remain)
    game_prizes = []
    for p, orig in zip(prizes, combined_prizes):
        prize = {}
        prize["available"] = p[0]
        prize["claimed"] = orig[0] - p[0]
        prize["value"] = p[1]
        prize["prize"] = locale.currency(p[1], grouping=True)
        game_prizes.append(prize)
    game["prizes"] = game_prizes
    return game


def main():
    index_html = fetch_html(INDEX_URL)
    game_urls = find_game_urls(index_html)
    game_names = find_game_names(index_html)
    # Iterate over (name, url) pairs, fetch each game's page, and parse it
    # into the dict that GameSchema expects. A failure on any single game is
    # logged and skipped so that one bad page doesn't sink the whole run.
    games = []

    for name, url in zip(game_names, game_urls):
        try:
            game_html = fetch_html(url)
        except Exception as e:
            logger.error("Error fetching %s: %s", url, e)
            continue
        try:
            games.append(parse_game_html(name, url, game_html))
        except Exception as e:
            _, _, tb = sys.exc_info()
            tb_msg = "\n".join(traceback.format_tb(tb))
            logger.error("Unable to parse game %s.\n%s\n%s", name, e, tb_msg)

    return games


if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))

lottery_data_scraper/schemas.py
@@ -0,0 +1,76 @@
"""Some marshmallow schemas to do data validation and serialization.

How to use:

Create your model as a plain old Python object.

Example:

    game = {}
    game["game_id"] = "5"
    game["price"] = 30
    game["state"] = "tx"

Then create an instance of the schema.

    schema = GameSchema()

Call `schema.dumps(game)` to "dump" your Python object to a string in JSON
format.

    >>> game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
    >>> schema = GameSchema()
    >>> schema.dumps(game)
    '{"game_id": "5", "state": "tx", "created_at": "2023-04-08T05:58:49.494561", "price": 30.0, "image_urls": "[]"}'

And you can load a JSON string into a Python object with `schema.loads`.

    >>> schema.loads(schema.dumps(game))
    {'game_id': '5', 'state': 'tx', 'created_at': datetime.datetime(2023, 4, 8, 5, 58, 49, 494561), 'price': 30.0, 'image_urls': []}

Some fields, like `game_id`, are required. You can validate a Python object
by calling `schema.validate`.

    >>> game = {"price": 30, "state": "tx", "created_at": datetime.utcnow()}
    >>> schema.dumps(game)
    '{"state": "tx", "created_at": "2023-04-08T06:02:32.126541", "price": 30.0, "image_urls": "[]"}'
    >>> schema.validate(game)
    {'created_at': ['Not a valid datetime.']}
"""
from datetime import datetime
import json

from marshmallow import Schema, fields


class PrizeSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    game_id = fields.Integer()
    available = fields.Integer()
    claimed = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    value = fields.Number()
    prize = fields.Str()


class GameSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    game_id = fields.Str(required=True)
    name = fields.Str()
    description = fields.Str()
    image_urls = fields.Function(
        lambda x: json.dumps(x.get("image_urls", [])),
        deserialize=lambda x: json.loads(x),
    )
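    # image_urls round-trips a Python list through a single JSON-encoded
    # string: dump turns the list into a string, load parses it back.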
    how_to_play = fields.Str()
    num_tx_initial = fields.Integer()
    price = fields.Number()
    prizes = fields.Nested(PrizeSchema, many=True)
    state = fields.Str()
    updated_at = fields.DateTime()
    url = fields.Str()

setup.py
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages


setup(
    name="lottery_data_scraper",
    version="0.0.1",
    author="Eric Ihli",
    author_email="eihli@owoga.com",
    url="https://github.com/owogac/lottery_data_scraper",
    packages=find_packages(),
    install_requires=[
        "beautifulsoup4",
        "requests==2.28.2",
        "urllib3==1.26.15",
        "numpy",
        "pandas",
        "lxml",
        "html2text",
        "html5lib",
        "marshmallow==3.19.0",
        "selenium==3.141.0",
        "pybind11",
        # If you want to develop locally and don't want to mess around with
        # Xvfb (https://en.wikipedia.org/wiki/Xvfb), then just comment out
        # the next line before you run `python3 setup.py install`.
        "xvfbwrapper==0.2.9",
        "table_ocr==0.2.5",
    ],
)

tests/test_pennsylvania.py
@@ -0,0 +1,23 @@
import unittest

from lottery_data_scraper import pennsylvania
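
# Note: this test fetches live pages from palottery.state.pa.us. Run with
# USE_CACHE=true so fetch_html caches the HTML between runs.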


class TestPennsylvania(unittest.TestCase):
    def test_parse_game_html(self):
        # URL chosen arbitrarily
        url = "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201"
        html = pennsylvania.fetch_html(url)
        game = pennsylvania.parse_game_html("$3 Million Mega Stacks", url, html)
        self.assertEqual(game["name"], "$3 Million Mega Stacks")
        self.assertEqual(game["price"], 30)
        self.assertEqual(
            game["url"],
            "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201",
        )
        self.assertEqual(game["game_id"], "3201")
        self.assertEqual(game["prizes"][0]["prize"], "$3,000,000.00")
        # Perhaps unfortunately in dollars. Cents would be better, eh?
        self.assertEqual(game["prizes"][0]["value"], 3000000)