From fdafaae2672a6587aa9b364becc54c29b47562e5 Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Fri, 7 Apr 2023 23:32:33 -0700
Subject: [PATCH] Add parser for Pennsylvania

---
 .gitignore                           |   2 +
 CHANGELOG.md                         |  12 +
 Makefile                             |  15 ++
 README.md                            | 101 +++++++
 TODO.md                              |  12 +
 lottery_data_scraper/__init__.py     |  40 +++
 lottery_data_scraper/pennsylvania.py | 385 +++++++++++++++++++++++++++
 lottery_data_scraper/schemas.py      |  76 ++++++
 setup.py                             |  28 ++
 tests/test_pennsylvania.py           |  23 ++
 10 files changed, 694 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CHANGELOG.md
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 TODO.md
 create mode 100644 lottery_data_scraper/__init__.py
 create mode 100644 lottery_data_scraper/pennsylvania.py
 create mode 100644 lottery_data_scraper/schemas.py
 create mode 100644 setup.py
 create mode 100644 tests/test_pennsylvania.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32aa08a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.egg-info/
+*.pyc
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..8f77428
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,12 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- Parser for Pennsylvania.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0e0e4e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+##
+# Lottery Data Scraper
+#
+# @file
+# @version 0.1
+
+FORCE:
+
+test: FORCE
+	python3 -m unittest discover tests
+
+style: FORCE
+	black .
+
+# end
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..dcd8bc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,101 @@
+# Parsing of lottery websites
+
+## Demo
+
+The following script should put you in a state where the last line makes a
+bunch of requests to the Pennsylvania lottery website, parses the tables of
+games/prizes, and prints a JSON structure of all of the games to your terminal.
+
+``` sh
+git clone https://github.com/owogawc/lottery_data_scraper
+cd lottery_data_scraper
+python3 -m venv ~/.virtualenvs/lottery_data_scraper
+. ~/.virtualenvs/lottery_data_scraper/bin/activate
+pip3 install -e .
+
+PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania
+```
+
+If you have [jq](https://stedolan.github.io/jq/) installed, you can get
+formatted output by piping the result to `jq` (and redirecting STDERR to /dev/null).
+
+``` sh
+PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania 2> /dev/null | jq
+```
+
+## Data models
+
+We're using [`marshmallow`](https://marshmallow.readthedocs.io/en/stable/index.html) to validate and serialize data.
+
+I'm including the schemas here just so you can quickly get a general idea of
+what data fields we're able to scrape from most lottery websites. What you see
+in this README might not be up-to-date with what's in
+[schemas.py](./lottery_data_scraper/schemas.py).
+
+As of 2023-04-07 the schemas are a work-in-progress. The remaining TODO is to
+determine and specify which fields are absolutely required and which are
+optional.
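+
+As a taste of what "required" will look like (a sketch, not the final
+schema; marshmallow's `required` keyword is the mechanism):
+
+``` python
+game_id = fields.Str(required=True)  # schema.validate reports this if absent
+price = fields.Number()              # optional
+```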
+
+### Game Schema
+
+``` python
+class GameSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    game_id = fields.Str(required=True)
+    name = fields.Str()
+    description = fields.Str()
+    image_urls = fields.Function(
+        lambda x: json.dumps(x.get("image_urls", [])),
+        deserialize=lambda x: json.loads(x),
+    )
+    how_to_play = fields.Str()
+    num_tx_initial = fields.Integer()
+    price = fields.Number()
+    prizes = fields.Nested(PrizeSchema, many=True)
+    state = fields.Str()
+    updated_at = fields.DateTime()
+    url = fields.Str()
+```
+
+### Prize Schema
+
+``` python
+class PrizeSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    game_id = fields.Integer()
+    available = fields.Integer()
+    claimed = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    value = fields.Number()
+    prize = fields.Str()
+```
+
+# Tests
+
+Testing is kind of tricky because you can't rely on _just_ Python with its
+`requests` library. Some states have scrape protections that require you to
+actually run JavaScript. Some states have extreme scrape protection that
+requires you to run an actual _display_. They check for a rendering context
+that doesn't exist when you run a headless browser in Selenium. To scrape
+those sites, you have to run an [X virtual
+framebuffer](https://en.wikipedia.org/wiki/Xvfb). Testing in these cases isn't
+as simple as running `python3 -m unittest discover`.
+
+# Contributing
+
+``` sh
+git clone https://github.com/owogawc/lottery_data_scraper
+cd lottery_data_scraper
+python3 -m venv ~/.virtualenvs/lottery_data_scraper
+. ~/.virtualenvs/lottery_data_scraper/bin/activate
+pip3 install -e .
+```
+
+Then you should be able to run `make test` and see the tests pass.
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..1d23a60
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,12 @@
+# TODO
+
+- [ ] Add more states.
+  - [ ] California
+  - [ ] Georgia
+  - [ ] Louisiana
+  - [ ] Florida
+  - [ ] Texas
+  - [ ] New Mexico
+- [ ] [Publish to PyPI](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
+- [ ] Do we still need Xvfb? Which states have that level of scrape protection?
+- [ ] Decide on and add a license.
diff --git a/lottery_data_scraper/__init__.py b/lottery_data_scraper/__init__.py
new file mode 100644
index 0000000..a0e1b91
--- /dev/null
+++ b/lottery_data_scraper/__init__.py
@@ -0,0 +1,40 @@
+"""
+Configure logging for the entire package.
+
+You can specify a log level with the environment variable
+PY_LOG_LVL=[debug|info|warning|error|critical]
+"""
+import logging
+import logging.config
+import os
+
+
+# Prefix the basic format with a timestamp, file pathname, and line number.
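+# A record will then look roughly like this (illustrative):
+#   2023-04-07 23:32:33,123 /path/to/pennsylvania.py 42 WARNING:root:message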
+# See: https://docs.python.org/3/library/logging.html#logrecord-attributes
+LOG_FORMAT = "%(asctime)s %(pathname)s %(lineno)s {}".format(logging.BASIC_FORMAT)
+
+log_level = getattr(logging, os.environ.get("PY_LOG_LVL", "WARNING").upper())
+logging_config = {
+    "version": 1,
+    "formatters": {
+        "standard": {
+            "format": LOG_FORMAT,
+        },
+    },
+    "handlers": {
+        "default": {
+            "level": log_level,
+            "formatter": "standard",
+            "class": "logging.StreamHandler",
+        },
+    },
+    "loggers": {
+        "": {
+            "handlers": ["default"],
+            "level": log_level,
+            "propagate": True,
+        },
+    },
+}
+
+logging.config.dictConfig(logging_config)
diff --git a/lottery_data_scraper/pennsylvania.py b/lottery_data_scraper/pennsylvania.py
new file mode 100644
index 0000000..13ee267
--- /dev/null
+++ b/lottery_data_scraper/pennsylvania.py
@@ -0,0 +1,385 @@
+"""
+Scrapes the Pennsylvania lottery website for scratch-off ticket
+data and calculates the expected value for each game.
+
+Pennsylvania publishes the number of tickets printed and how many
+tickets are printed at each prize level.
+
+We can calculate the expected value of a game by summing
+the value of all the prizes and dividing that by the cost
+of all the tickets.
+
+The palottery website has an "index" page that has links to every game.
+Each individual game has a link to a "game rules" page.
+We can start at the index and visit every game rules page, then we
+can find the html table on that page which has the detailed prize
+information and run our calculations.
+
+Website that we'll be scraping:
+https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx
+
+Example usage:
+    python -m lottery_data_scraper.pennsylvania
+Or:
+    PY_LOG_LVL=DEBUG USE_CACHE=true python -m lottery_data_scraper.pennsylvania
+
+The following behavior is configurable through shell environment variables.
+
+Set PY_LOG_LVL to print useful debug info to the console.
+PY_LOG_LVL=[DEBUG|INFO|WARNING|ERROR|CRITICAL]
+Defaults to WARNING.
+
+Set USE_CACHE to cache responses. This speeds up development
+and is nice to the servers we're hitting.
+USE_CACHE=[true]
+Defaults to unset (no caching). Note: setting this variable to the
+string "False" still enables the cache, because any non-empty string
+is truthy. Either set it to true or leave it unset.
+"""
+import base64
+import sys
+import traceback
+from copy import deepcopy
+import locale
+import logging
+import os
+import re
+from tempfile import gettempdir
+
+from bs4 import BeautifulSoup as bs
+import requests
+
+from lottery_data_scraper.schemas import GameSchema
+
+logger = logging.getLogger(__name__)
+locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
+
+# Values that are used in many places throughout a script are
+# worth assigning to constants.
+BASE_URL = "https://www.palottery.state.pa.us"
+INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"
+
+
+def fetch_html(url):
+    """
+    Helper to fetch and cache html responses.
+
+    During development and while testing, we'll be hitting the same urls often.
+    The content of the pages probably won't be changing.
+    Caching the results will speed up development,
+    and the servers will appreciate us for not spamming requests.
+
+    The responses are cached in the operating system's tempfile directory.
+    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
+    The filename is based on the URL. But since the URL might contain
+    characters that are invalid for filenames, we base64 encode the URL.
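+
+    For example (illustrative), the cache filename for "https://example.com":
+
+        >>> base64.urlsafe_b64encode(b"https://example.com").decode("utf-8")
+        'aHR0cHM6Ly9leGFtcGxlLmNvbQ=='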
+    """
+    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
+    filepath = os.path.join(gettempdir(), safe_filename)
+
+    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
+        with open(filepath, "r") as f:
+            return f.read()
+    else:
+        # We are relying on the outside world when we make a request, so we
+        # might want to wrap this in a try/except. But we'd only want to
+        # do that in two cases:
+        #
+        # 1. We have a way of handling exceptions. A good example would be
+        #    to catch exceptions and retry the request; maybe the network
+        #    was down.
+        #
+        # 2. We can't handle the exception, but we want to log something
+        #    more useful than the stack trace that will get spit out if
+        #    we just let the exception go uncaught.
+        #
+        # In this case, I don't think it's worth muddying up the code
+        # trying to handle exceptions here. It's easy enough to just re-run
+        # the script.
+        html = requests.get(url).text
+        if os.environ.get("USE_CACHE", False):
+            with open(filepath, "w+") as f:
+                f.write(html)
+        return html
+
+
+def find_game_names(html):
+    """
+    Game names can be found on the index page
+    in the text of anchor elements
+    which have the class "activeGame_li".
+    """
+    soup = bs(html, "lxml")
+    game_elements = soup.find_all("a", class_="activeGame_li")
+    return [
+        re.sub(r"\s+", " ", g.find("div", class_="info").text) for g in game_elements
+    ]
+
+
+def find_game_urls(html):
+    """
+    Luckily, all of the Pennsylvania games are listed on a single html page.
+    We don't have to mess around with pagination and making multiple requests.
+
+    The links are "href" attributes of anchor tags with the class "activeGame_li".
+    """
+    soup = bs(html, "lxml")
+    game_elements = soup.find_all("a", class_="activeGame_li")
+    return ["{}{}".format(BASE_URL, e.attrs["href"]) for e in game_elements]
+
+
+def find_complete_game_rules_url(html):
+    """
+    Game pages have a link to the complete game rules.
+    The complete game rules have a table of all prizes for a game.
+
+    The link to the game rules page is in an anchor tag
+    nested under a div with the class "instant-games-games-info".
+    """
+    soup = bs(html, "lxml")
+    games_info_div = soup.find("div", class_="instant-games-games-info")
+    games_info_anchor = games_info_div.find_all("a")[1]
+    games_info_url = games_info_anchor.attrs["href"]
+    return games_info_url
+
+
+def find_rows(html):
+    """
+    From a game rules page, find the rows of the table
+    that have the number of tickets and the value of each prize.
+    """
+    soup = bs(html, "lxml")
+
+    # Some game rules pages have multiple tables.
+    # The first table has the prizes.
+    # soup.find returns the first matching element;
+    # soup.find_all returns a list of all matching elements.
+    prize_table = soup.find("table")
+    row_elements = prize_table.find_all("tr")
+
+    # The first row is headers. We want to skip it for the
+    # calculations, but it holds an important bit of information:
+    # the total number of tickets printed.
+    #
+    # The remaining rows only contain winning-ticket info, and we
+    # also care about the losing prize tier. It has a value of "0",
+    # but we want to know how many losing tickets there are.
+    #
+    # We can calculate that from the first header: subtract the sum
+    # of the winning tickets from the total number of tickets printed,
+    # giving us the number of losing tickets.
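+    #
+    # For example (illustrative numbers): if the header says 10,000,000
+    # total tickets printed and the winning rows sum to 3,000,000 tickets,
+    # we'll insert a losing row of (0, 7000000) below.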
+    header_row = row_elements[0]
+    header_columns = header_row.find_all("th")
+    total_number_tickets = int(re.sub(r"\D", "", header_columns[-1].text))
+
+    row_elements = row_elements[1:]
+
+    # We only care about the last and third-to-last columns.
+    # The following helper functions parse the data we care
+    # about from each row.
+    #
+    # The last column is the number of tickets at this prize level.
+    # The number of tickets has commas, like 1,350,500.
+    # We'll have to parse them out.
+    #
+    # The third-to-last column is the prize value. (The second-to-last
+    # holds the odds, which we don't need here.)
+    # Prize value is usually "$" followed by a number.
+    # Those are easy to parse.
+    # But for the free ticket prize it's "FREE $1 TICKET".
+    def parse_value(row_element):
+        columns = row_element.find_all("td")
+        try:
+            value_element = columns[-3]
+            value_text = value_element.text
+            return int(re.sub(r"\D", "", value_text))
+        except Exception:
+            # This is an exception we can handle.
+            # We can simply return a value of 0 if
+            # the row doesn't have what we expect.
+            # Our result might be inaccurate, but
+            # I'll consider that acceptable.
+            # I'll log something useful so I know
+            # to look into it.
+            logger.warning("Exception parsing value for a row: %s", row_element.text)
+            return 0
+
+    def parse_num_tickets(row_element):
+        columns = row_element.find_all("td")
+        try:
+            num_tickets_element = columns[-1]
+            num_tickets_text = num_tickets_element.text
+            return int(num_tickets_text.replace(",", ""))
+        except Exception:
+            # Same as above, we can handle this.
+            # Logging and returning 0 is better than blowing up.
+            logger.warning(
+                "Exception parsing num_tickets for a row.\n{}".format(row_element.text)
+            )
+            return 0
+
+    # Iterate over each row and parse out the value of the prize tier
+    # and the number of remaining tickets at that prize tier.
+    rows = [(parse_value(e), parse_num_tickets(e)) for e in row_elements]
+    number_winning_tickets = sum(r[1] for r in rows)
+
+    # Insert the losing ticket value, $0, and the number
+    # of losing tickets into our rows.
+    rows.insert(0, (0, total_number_tickets - number_winning_tickets))
+    return rows
+
+
+def find_price(html):
+    """
+    Price is hard to find. It seems to always be the last word of the
+    text surrounding the element whose text is "Price". So we can find
+    that "Price" string, get the text of its grandparent, and take the
+    last word of that text. That will be the price of the ticket as a
+    string that looks like "$10.", which we can then strip of the
+    non-digits.
+    """
+    soup = bs(html, "lxml")
+    price_element = soup.find(string="Price")
+    price_text = price_element.parent.parent.text.split(" ")[-1]
+    price = int(re.sub(r"\D", "", price_text))
+    return price
+
+
+def calculate_original_ev(game_url):
+    """
+    The "expected value" or "return on investment" of a game
+    is the total value of all the prizes
+    divided by the total cost of all the tickets.
+
+    Imagine you bought every ticket that was printed.
+
+    How much money would you spend? How much money would you get back in prizes?
+
+    If you won $1,500,000 and spent $2,000,000,
+    then your expected value is 1,500,000 / 2,000,000 = 0.75.
+
+    For every $1 spent on the game, you'll get back $0.75,
+    for an average loss of $0.25.
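+
+    In terms of this module's (value, num_tickets) rows, that's (a sketch
+    of the computation done below):
+
+        ev = sum(v * n for v, n in rows) / (sum(n for _, n in rows) * price)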
+    """
+    game_html = fetch_html(game_url)
+    game_rules_url = find_complete_game_rules_url(game_html)
+    game_rules_html = fetch_html(game_rules_url)
+    price = find_price(game_rules_html)
+    rows = find_rows(game_rules_html)
+    total_number_tickets = sum(r[1] for r in rows)
+    total_value_tickets = sum(r[1] * r[0] for r in rows)
+    total_cost_tickets = total_number_tickets * price
+    ev = total_value_tickets / total_cost_tickets
+    return ev
+
+
+def combine_prizes(prizes):
+    # Merge adjacent prize rows that share the same prize value,
+    # summing their ticket counts. Rows are [num_tickets, odds, value].
+    combined = []
+    last_prize = prizes[0]
+    for prize in prizes[1:]:
+        if last_prize[-1] == prize[-1]:
+            last_prize[0] += prize[0]
+        else:
+            combined.append(last_prize)
+            last_prize = prize
+    combined.append(last_prize)
+    return combined
+
+
+def parse_game_html(name, url, html):
+    game = {}
+    game_soup = bs(html, "lxml")
+    game["name"] = name.strip()
+    game["url"] = url
+    game["game_id"] = re.match(r".*?(\d+$)", url).group(1)
+    game_rules_url = find_complete_game_rules_url(html)
+    game_rules_html = fetch_html(game_rules_url)
+    game_rules_soup = bs(game_rules_html, "lxml")
+    game["price"] = find_price(game_rules_html)
+    prize_table = game_rules_soup.find("table", class_="miscr")
+
+    def prize_value(p, price):
+        p = p.text.strip()
+        if re.search(r"FREE", p):
+            return price
+        else:
+            return p.replace("$", "").replace(",", "")
+
+    # Each prize row becomes [num_tickets, odds, value].
+    prize_tuples = [
+        [
+            int(tds[-1].text.replace(",", "").strip()),
+            float(tds[-2].text.replace(",", "").strip()),
+            float(prize_value(tds[-3], game["price"])),
+        ]
+        for tds in [tr.find_all("td") for tr in prize_table.find_all("tr")[1:]]
+    ]
+    game["num_tx_initial"] = prize_tuples[-1][0] * prize_tuples[-1][1]
+    game["state"] = "pa"
+    combined_prizes = sorted(combine_prizes(deepcopy(prize_tuples)), key=lambda x: x[2])
+    prizes_remaining_table = game_soup.find("table", class_="table-global").find(
+        "tbody"
+    )
+    prizes_remaining = [
+        [
+            int(tds[1].text.strip()),
+            float(tds[0].text.replace("$", "").replace(",", "").strip()),
+        ]
+        for tds in [tr.find_all("td") for tr in prizes_remaining_table.find_all("tr")]
+    ]
+    # The game page only reports remaining counts for the top prize tiers.
+    # Use those counts directly, and estimate the rest by scaling original
+    # counts by the fraction of those top tiers' tickets still remaining.
+    percent_tx_remain = sum(p[0] for p in prizes_remaining) / sum(
+        p[0] for p in combined_prizes[: -len(prizes_remaining) - 1 : -1]
+    )
+    combined_prizes = sorted(
+        [[p[0], p[2]] for p in combined_prizes], key=lambda x: -x[1]
+    )
+    prizes = sorted(deepcopy(combined_prizes), key=lambda x: -x[1])
+    prizes[: len(prizes_remaining)] = prizes_remaining
+    for prize in prizes[len(prizes_remaining) :]:
+        prize[0] = int(prize[0] * percent_tx_remain)
+    game_prizes = []
+    for p, orig in zip(prizes, combined_prizes):
+        prize = {}
+        prize["available"] = p[0]
+        prize["claimed"] = orig[0] - p[0]
+        prize["value"] = p[1]
+        prize["prize"] = locale.currency(p[1], grouping=True)
+        game_prizes.append(prize)
+    game["prizes"] = game_prizes
+    return game
+
+
+def main():
+    index_html = fetch_html(INDEX_URL)
+    game_urls = find_game_urls(index_html)
+    game_names = find_game_names(index_html)
+    # For each game, fetch its page and parse it into a game dict.
+    # Parsing involves a second fetch: each game page links to a
+    # complete game rules page that holds the full prize table.
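+    #
+    # Each parsed game is a dict shaped roughly like (illustrative):
+    #   {"name": "...", "game_id": "3201", "price": 30, "prizes": [...], ...}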
+    games = []
+
+    for name, url in list(zip(game_names, game_urls)):
+        try:
+            game_html = fetch_html(url)
+        except Exception as e:
+            logger.error("Error fetching %s: %s", url, e)
+            continue
+        try:
+            games.append(parse_game_html(name, url, game_html))
+        except Exception as e:
+            _, _, tb = sys.exc_info()
+            tb_msg = "\n".join(traceback.format_tb(tb))
+            logger.error("Unable to parse game {}.\n{}\n{}".format(name, e, tb_msg))
+
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
diff --git a/lottery_data_scraper/schemas.py b/lottery_data_scraper/schemas.py
new file mode 100644
index 0000000..dd4b6d6
--- /dev/null
+++ b/lottery_data_scraper/schemas.py
@@ -0,0 +1,76 @@
+"""Some marshmallow schemas to do data validation and serialization.
+
+How to use:
+
+Create your model as a plain old Python object.
+
+Example:
+
+    game = {}
+    game["game_id"] = "5"
+    game["price"] = 30
+    game["state"] = "tx"
+
+Then create an instance of the schema.
+
+    schema = GameSchema()
+
+Call `schema.dumps(game)` to "dump" your Python object to a string in JSON
+format.
+
+    >>> game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
+    >>> schema = GameSchema()
+    >>> schema.dumps(game)
+    '{"game_id": "5", "state": "tx", "created_at": "2023-04-08T05:58:49.494561", "price": 30.0, "image_urls": "[]"}'
+
+And you can load a JSON string into a Python object with `schema.loads`.
+
+    >>> schema.loads(schema.dumps(game))
+    {'game_id': '5', 'state': 'tx', 'created_at': datetime.datetime(2023, 4, 8, 5, 58, 49, 494561), 'price': 30.0, 'image_urls': []}
+
+Some fields, like `game_id`, are required. You can validate a Python object by calling `schema.validate`.
+
+    >>> game = {"price": 30, "state": "tx", "created_at": datetime.utcnow()}
+    >>> schema.dumps(game)
+    '{"state": "tx", "created_at": "2023-04-08T06:02:32.126541", "price": 30.0, "image_urls": "[]"}'
+    >>> schema.validate(game)
+    {'game_id': ['Missing data for required field.'], 'created_at': ['Not a valid datetime.']}
+"""
+from datetime import datetime
+import json
+from marshmallow import Schema, fields
+
+
+class PrizeSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    game_id = fields.Integer()
+    available = fields.Integer()
+    claimed = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    value = fields.Number()
+    prize = fields.Str()
+
+
+class GameSchema(Schema):
+    class Meta:
+        render_module = json
+
+    id = fields.Integer()
+    created_at = fields.DateTime(load_default=datetime.utcnow)
+    game_id = fields.Str(required=True)
+    name = fields.Str()
+    description = fields.Str()
+    image_urls = fields.Function(
+        lambda x: json.dumps(x.get("image_urls", [])),
+        deserialize=lambda x: json.loads(x),
+    )
+    how_to_play = fields.Str()
+    num_tx_initial = fields.Integer()
+    price = fields.Number()
+    prizes = fields.Nested(PrizeSchema, many=True)
+    state = fields.Str()
+    updated_at = fields.DateTime()
+    url = fields.Str()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..3d1db5f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="lottery_data_scraper",
+    version="0.0.1",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    url="https://github.com/owogac/lottery_data_scraper",
+    packages=find_packages(),
+    install_requires=[
+        "beautifulsoup4",
+        "requests==2.28.2",
+        "urllib3==1.26.15",
+        "numpy",
+        "pandas",
+        "lxml",
+        "html2text",
+        "html5lib",
+        "marshmallow==3.19.0",
+        "selenium==3.141.0",
+        "pybind11",
+        # If you want to develop locally and don't want to mess around with
+        # Xvfb (https://en.wikipedia.org/wiki/Xvfb), then just comment out
+        # the next line before you run `python3 setup.py install`.
+        "xvfbwrapper==0.2.9",
+        "table_ocr==0.2.5",
+    ],
+)
diff --git a/tests/test_pennsylvania.py b/tests/test_pennsylvania.py
new file mode 100644
index 0000000..7737299
--- /dev/null
+++ b/tests/test_pennsylvania.py
@@ -0,0 +1,23 @@
+import unittest
+
+from lottery_data_scraper import pennsylvania
+
+
+class TestPennsylvania(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201"
+        html = pennsylvania.fetch_html(url)
+        game = pennsylvania.parse_game_html("$3 Million Mega Stacks", url, html)
+        self.assertEqual(game["name"], "$3 Million Mega Stacks")
+        self.assertEqual(game["price"], 30)
+        self.assertEqual(
+            game["url"],
+            "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201",
+        )
+        self.assertEqual(game["game_id"], "3201")
+        self.assertEqual(game["prizes"][0]["prize"], "$3,000,000.00")
+        # Perhaps unfortunately in dollars. Cents would be better, eh?
+        self.assertEqual(game["prizes"][0]["value"], 3000000)
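+
+    def test_combine_prizes(self):
+        # Network-free sanity check with illustrative numbers.
+        # combine_prizes merges adjacent rows that share a prize value,
+        # summing ticket counts. Rows are [num_tickets, odds, value].
+        prizes = [[10, 1.5, 100.0], [5, 3.0, 100.0], [2, 8.0, 500.0]]
+        self.assertEqual(
+            pennsylvania.combine_prizes(prizes),
+            [[15, 1.5, 100.0], [2, 8.0, 500.0]],
+        )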