Add parser for Pennsylvania
commit fdafaae267

.gitignore
@@ -0,0 +1,2 @@
*.egg-info/
*.pyc

CHANGELOG.md
@@ -0,0 +1,12 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Parser for Pennsylvania.

Makefile
@@ -0,0 +1,15 @@
##
# Lottery Data Scraper
#
# @file
# @version 0.1

FORCE:

test: FORCE
	python3 -m unittest discover tests

style: FORCE
	black .

# end

README.md
@@ -0,0 +1,101 @@
# Parsing of lottery websites

## Demo

The following script should put you in a state where the last line will make a
bunch of requests to the Pennsylvania lottery website, parse the tables of
games/prizes, and print to your terminal a JSON structure of all of the games.

``` sh
git clone https://github.com/owogawc/lottery_data_scraper
cd lottery_data_scraper
python3 -m venv ~/.virtualenvs/lottery_data_scraper
. ~/.virtualenvs/lottery_data_scraper/bin/activate
pip3 install -e .

PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania
```

If you have [jq](https://stedolan.github.io/jq/) installed, you can get some
formatted output by piping it to `jq` (and redirecting STDERR to /dev/null).

``` sh
PY_LOG_LVL=DEBUG USE_CACHE=true python3 -m lottery_data_scraper.pennsylvania 2> /dev/null | jq
```

## Data models

We're using [`marshmallow`](https://marshmallow.readthedocs.io/en/stable/index.html) to validate and serialize data.

I'm including the schemas here just so you can quickly get a general idea of
what data fields we're able to scrape from most lottery websites. What you see
in this README might not be up-to-date with what's in
[schemas.py](./lottery_data_scraper/schemas.py).

As of 2023-04-07 the schemas are a work-in-progress. The remaining TODO is to
determine and specify which fields are absolutely required and which are
optional.

### Game Schema

``` python
class GameSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    game_id = fields.Str()
    name = fields.Str()
    description = fields.Str()
    image_urls = fields.Function(
        lambda x: json.loads(x.image_urls) if x.image_urls else [],
        deserialize=lambda x: None if x.image_urls == [] else json.dumps(x.image_urls),
    )
    how_to_play = fields.Str()
    num_tx_initial = fields.Integer()
    price = fields.Number()
    prizes = fields.Nested(PrizeSchema, many=True)
    state = fields.Str()
    updated_at = fields.DateTime()
    url = fields.Str()
```

### Prize Schema

``` python
class PrizeSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    game_id = fields.Integer()
    available = fields.Integer()
    claimed = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    value = fields.Number()
    prize = fields.Str()
```
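
For a quick feel of how these schemas behave, here's the round-trip from the
docstring in [schemas.py](./lottery_data_scraper/schemas.py): `dumps` gives
you a JSON string, and `loads` parses it back into a Python dict.

``` python
from datetime import datetime

from lottery_data_scraper.schemas import GameSchema

game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
schema = GameSchema()
s = schema.dumps(game)  # JSON string, e.g. '{"game_id": "5", "state": "tx", ...}'
game_again = schema.loads(s)  # back to a dict with a real datetime
```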

# Tests

Testing is kind of tricky because you can't rely on _just_ Python with its
`requests` library. Some states have scrape protections that require you to
actually run JavaScript. Some states have extreme scrape protection that
requires you to actually run a _display_. They check for some rendering
context that doesn't exist when you run a headless browser in Selenium. To
scrape those sites, you actually have to run an [X virtual
framebuffer](https://en.wikipedia.org/wiki/Xvfb). Testing in these cases isn't
as simple as running `python3 -m unittest discover`.
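
For the display-checking sites, the general shape is to wrap the Selenium
session in a virtual framebuffer. Here's a minimal sketch, assuming
`xvfbwrapper` and `selenium` (both pinned in setup.py), a local
Firefox/geckodriver install, and the Xvfb binary itself (e.g. `apt install
xvfb`); the URL is a placeholder:

``` python
from selenium import webdriver
from xvfbwrapper import Xvfb

# Xvfb() starts a throwaway virtual X display, so the browser below renders
# "for real" as far as the site's protection can tell.
with Xvfb():
    driver = webdriver.Firefox()
    try:
        driver.get("https://example.com/heavily-protected-lottery-page")
        html = driver.page_source
    finally:
        driver.quit()
```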

# Contributing

``` sh
git clone https://github.com/owogawc/lottery_data_scraper
cd lottery_data_scraper
python3 -m venv ~/.virtualenvs/lottery_data_scraper
. ~/.virtualenvs/lottery_data_scraper/bin/activate
pip3 install -e .
```

Then you should be able to run `make test` and see the tests pass.

TODO.md
@@ -0,0 +1,12 @@
# TODO

- [ ] Add more states.
  - [ ] California
  - [ ] Georgia
  - [ ] Louisiana
  - [ ] Florida
  - [ ] Texas
  - [ ] New Mexico
- [ ] [Publish to PyPI](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
- [ ] Do we still need Xvfb? Which states have that level of scrape protection?
- [ ] Decide on and add a license.

lottery_data_scraper/__init__.py
@@ -0,0 +1,40 @@
"""
Configure logging for the entire package.

You can specify a log level with the environment variable
PY_LOG_LVL=[debug|info|warning|error|critical]
"""
import logging
import logging.config
import os


# Prefix the basic format with a timestamp, file pathname, and line number.
# See: https://docs.python.org/3/library/logging.html#logrecord-attributes
LOG_FORMAT = "%(asctime)s %(pathname)s %(lineno)s {}".format(logging.BASIC_FORMAT)

log_level = getattr(logging, os.environ.get("PY_LOG_LVL", "WARNING").upper())
logging_config = {
    "version": 1,
    "formatters": {
        "standard": {
            "format": LOG_FORMAT,
        },
    },
    "handlers": {
        "default": {
            "level": log_level,
            "formatter": "standard",
            "class": "logging.StreamHandler",
        },
    },
    "loggers": {
        "": {
            "handlers": ["default"],
            "level": log_level,
            "propagate": True,
        },
    },
}

logging.config.dictConfig(logging_config)
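
# Modules in this package get configured loggers through the root ("") logger
# above, e.g.:
#
#     logger = logging.getLogger(__name__)
#     logger.debug("Only shown when PY_LOG_LVL=debug.")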

lottery_data_scraper/pennsylvania.py
@@ -0,0 +1,385 @@
"""
Scrapes the Pennsylvania lottery website for scratch-off ticket
data and calculates the expected value for each game.

Pennsylvania publishes the number of tickets printed and how many
tickets are printed at each prize level.

We can calculate the expected value of a game by summing
the value of all the prizes and dividing that by the cost
of all the tickets.

The palottery website has an "index" page that has links to every game.
Each individual game has a link to a "game rules" page.
We can start at the index and visit every game rules page, then we
can find the html table on that page which has the detailed prize
information and run our calculations.

Website that we'll be scraping:
https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx

Example usage:
    python3 -m lottery_data_scraper.pennsylvania
Or:
    PY_LOG_LVL=DEBUG USE_CACHE=True python3 -m lottery_data_scraper.pennsylvania

The following behavior is configurable through shell environment variables.

Set PY_LOG_LVL to print useful debug info to console.
    PY_LOG_LVL=[DEBUG|INFO|WARNING|ERROR|CRITICAL]
    Defaults to WARNING.

Set USE_CACHE to cache responses. This speeds up development
and is nice to the servers we're hitting.
    USE_CACHE=[True]
    Defaults to False. Note: setting this env variable to the string "False"
    will still use the cache, because the non-empty string "False" is truthy.
    Either set it to True or don't set it at all.
"""
import base64
import locale
import logging
import os
import re
import sys
import traceback
from copy import deepcopy
from tempfile import gettempdir

import requests
from bs4 import BeautifulSoup as bs

from lottery_data_scraper.schemas import GameSchema

logger = logging.getLogger(__name__)
locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")

# It's worth assigning to constants values that are used in many
# places throughout a script.
BASE_URL = "https://www.palottery.state.pa.us"
INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"


def fetch_html(url):
    """
    Helper to fetch and cache html responses.

    During development and while testing, we'll be hitting the same urls often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.

    The responses are cached in the operating system's tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)

    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling exceptions.
        #    A good example would be to catch exceptions and retry the
        #    request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        #    more useful than the stack trace that will get spit out if
        #    we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html
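

# Illustration of the cache-filename scheme above; value checked by hand:
#   base64.urlsafe_b64encode(b"https://example.com/a?b=1").decode("utf-8")
#   == "aHR0cHM6Ly9leGFtcGxlLmNvbS9hP2I9MQ=="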


def find_game_names(html):
    """
    Game names can be found on the index page
    in the text of anchor elements
    which have the class "activeGame_li".
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return [
        re.sub(r"\s+", " ", g.find("div", class_="info").text) for g in game_elements
    ]


def find_game_urls(html):
    """
    Luckily, all of the Pennsylvania games are listed on a single html page.
    We don't have to mess around with any pagination and making multiple requests.

    The links are "href" attributes of anchor tags with the class "activeGame_li".
    """
    soup = bs(html, "lxml")
    game_elements = soup.find_all("a", class_="activeGame_li")
    return ["{}{}".format(BASE_URL, e.attrs["href"]) for e in game_elements]


def find_complete_game_rules_url(html):
    """
    Game pages have a link to the complete game rules.
    The complete game rules have a table of all prizes for a game.

    The link to the game rules page is in an anchor tag
    nested under a div with the class "instant-games-games-info".
    """
    soup = bs(html, "lxml")
    games_info_div = soup.find("div", class_="instant-games-games-info")
    games_info_anchor = games_info_div.find_all("a")[1]
    games_info_url = games_info_anchor.attrs["href"]
    return games_info_url


def find_rows(html):
    """
    From a game rules page, find the rows of the table
    that have the number of tickets and the value of each prize.
    """
    soup = bs(html, "lxml")

    # Some game rules pages have multiple tables.
    # The first table has the prizes.
    # soup.find returns the first matching element.
    # soup.find_all returns a list of all matching elements.
    prize_table = soup.find("table")
    row_elements = prize_table.find_all("tr")

    # The first row is headers, so we want to skip it for the
    # calculations, but it includes an important bit of information
    # that we want. The body rows only contain winning ticket info,
    # and we also care about a row for the losing prize tier.
    # It will have a value of "0", but we want to know
    # how many losing tickets there are.
    #
    # We can calculate that from the first header. It
    # contains the total number of tickets printed.
    # Let's get the total number of tickets printed so
    # we can subtract the sum of the number of winning tickets,
    # giving us the number of losing tickets.
    header_row = row_elements[0]
    header_columns = header_row.find_all("th")
    total_number_tickets = int(re.sub(r"\D", "", header_columns[-1].text))

    row_elements = row_elements[1:]

    # We only care about the last and third-to-last columns.
    # The following helper functions will help us parse
    # the data we care about from each row.
    #
    # The last column is the number of tickets at this prize level.
    # The number of tickets has commas, like 1,350,500.
    # We'll have to parse them out.
    #
    # The third-to-last column is the prize value.
    # Prize value is usually "$" followed by a number.
    # Those are easy to parse.
    # But for the free ticket prize it's "FREE $1 TICKET".
    def parse_value(row_element):
        columns = row_element.find_all("td")
        try:
            value_element = columns[-3]
            value_text = value_element.text
            return int(re.sub(r"\D", "", value_text))
        except Exception:
            # This is an exception we can handle.
            # We can simply return a value of 0 if
            # the row doesn't have what we expect.
            # Our result might be inaccurate, but
            # I'll consider that acceptable.
            # I'll log something useful so I know
            # to look into it.
            logger.warning("Exception parsing value for a row: %s", row_element.text)
            return 0

    def parse_num_tickets(row_element):
        columns = row_element.find_all("td")
        try:
            num_tickets_element = columns[-1]
            num_tickets_text = num_tickets_element.text
            return int(num_tickets_text.replace(",", ""))
        except Exception:
            # Same as above, we can handle this.
            # Logging and returning 0 is better than blowing up.
            logger.warning(
                "Exception parsing num_tickets for a row.\n%s", row_element.text
            )
            return 0

    # Iterate over each row and parse out the value of the prize tier
    # and the number of remaining tickets at that prize tier.
    rows = [(parse_value(e), parse_num_tickets(e)) for e in row_elements]
    number_winning_tickets = sum(r[1] for r in rows)

    # Insert the losing ticket value, $0, and the number
    # of losing tickets into our rows.
    rows.insert(0, (0, total_number_tickets - number_winning_tickets))
    return rows


def find_price(html):
    """
    Price is hard to find. It seems to always be a sibling to an
    <i> tag which has the text "Price". So, we can find that <i>
    tag, get the text of its parent, find the last word of that text,
    and that will be the price of the ticket as a string that looks like
    "$10.", which we can then strip of the non-digits.
    """
    soup = bs(html, "lxml")
    price_element = soup.find(string="Price")
    price_text = price_element.parent.parent.text.split(" ")[-1]
    price = int(re.sub(r"\D", "", price_text))
    return price


def calculate_original_ev(game_url):
    """
    The "expected value" or "return on investment" of a game
    is the total value of all of the prizes
    divided by the total cost of all of the tickets.

    Imagine you bought every ticket that was printed.

    How much money would you spend? How much money would you get back in prizes?

    If you won $1,500,000 and spent $2,000,000,
    then your expected value is 1,500,000 / 2,000,000 = 0.75.

    For every $1 spent on the game, you'll get back $0.75,
    for an average loss of $0.25.
    """
    game_html = fetch_html(game_url)
    game_rules_url = find_complete_game_rules_url(game_html)
    game_rules_html = fetch_html(game_rules_url)
    price = find_price(game_rules_html)
    rows = find_rows(game_rules_html)
    total_number_tickets = sum(r[1] for r in rows)
    total_value_tickets = sum(r[1] * r[0] for r in rows)
    total_cost_tickets = total_number_tickets * price
    ev = total_value_tickets / total_cost_tickets
    return ev
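

# Illustrative usage (id 3201 is the game used in the tests; the result is a
# ratio, e.g. roughly 0.75 would mean $0.75 back per $1 spent):
#   calculate_original_ev(f"{BASE_URL}/Scratch-Offs/View-Scratch-Off.aspx?id=3201")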


def combine_prizes(prizes):
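    """
    Merge adjacent prize tiers that share the same prize value, summing
    their ticket counts. Rows are [num_tickets, odds, value] lists as built
    in parse_game_html; only adjacent equal-value rows are merged.
    """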
    combined = []
    last_prize = prizes[0]
    for prize in prizes[1:]:
        if last_prize[-1] == prize[-1]:
            last_prize[0] += prize[0]
        else:
            combined.append(last_prize)
            last_prize = prize
    combined.append(last_prize)
    return combined


def parse_game_html(name, url, html):
    game = {}
    game_soup = bs(html, "lxml")
    game["name"] = name.strip()
    game["url"] = url
    game["game_id"] = re.match(r".*?(\d+$)", url).group(1)
    game_rules_url = find_complete_game_rules_url(html)
    game_rules_html = fetch_html(game_rules_url)
    game_rules_soup = bs(game_rules_html, "lxml")
    game["price"] = find_price(game_rules_html)
    prize_table = game_rules_soup.find("table", class_="miscr")

    def prize_value(p, price):
        # A "FREE $X TICKET" prize is worth the price of a ticket.
        p = p.text.strip()
        if re.search(r"FREE", p):
            return price
        else:
            return p.replace("$", "").replace(",", "")

    # Each row becomes [num_tickets, odds, prize value].
    prize_tuples = [
        [
            int(tds[-1].text.replace(",", "").strip()),
            float(tds[-2].text.replace(",", "").strip()),
            float(prize_value(tds[-3], game["price"])),
        ]
        for tds in [tr.find_all("td") for tr in prize_table.find_all("tr")[1:]]
    ]
    # A tier's ticket count times its odds recovers the total number of
    # tickets printed.
    game["num_tx_initial"] = prize_tuples[-1][0] * prize_tuples[-1][1]
    game["state"] = "pa"
    combined_prizes = sorted(combine_prizes(deepcopy(prize_tuples)), key=lambda x: x[2])
    prizes_remaining_table = game_soup.find("table", class_="table-global").find(
        "tbody"
    )
    prizes_remaining = [
        [
            int(tds[1].text.strip()),
            float(tds[0].text.replace("$", "").replace(",", "").strip()),
        ]
        for tds in [tr.find_all("td") for tr in prizes_remaining_table.find_all("tr")]
    ]
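    # The game page only reports "prizes remaining" for the top prize tiers.
    # Comparing those remaining counts against the same tiers' original
    # counts estimates the fraction of tickets still in circulation; the
    # unreported lower tiers are scaled by that fraction below.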
    percent_tx_remain = sum(p[0] for p in prizes_remaining) / sum(
        p[0] for p in combined_prizes[: -len(prizes_remaining) - 1 : -1]
    )
    combined_prizes = sorted(
        [[p[0], p[2]] for p in combined_prizes], key=lambda x: -x[1]
    )
    prizes = sorted(deepcopy(combined_prizes), key=lambda x: -x[1])
    prizes[: len(prizes_remaining)] = prizes_remaining
    for prize in prizes[len(prizes_remaining) :]:
        prize[0] = int(prize[0] * percent_tx_remain)
    game_prizes = []
    for p, orig in zip(prizes, combined_prizes):
        prize = {}
        prize["available"] = p[0]
        prize["claimed"] = orig[0] - p[0]
        prize["value"] = p[1]
        prize["prize"] = locale.currency(p[1], grouping=True)
        game_prizes.append(prize)
    game["prizes"] = game_prizes
    return game


def main():
    index_html = fetch_html(INDEX_URL)
    game_urls = find_game_urls(index_html)
    game_names = find_game_names(index_html)
    # Iterate over (name, url) pairs, fetch each game's page, and parse it
    # into the dict that GameSchema expects. A failure on any single game is
    # logged and skipped so that one bad page doesn't sink the whole run.
    games = []

    for name, url in zip(game_names, game_urls):
        try:
            game_html = fetch_html(url)
        except Exception as e:
            logger.error("Error fetching %s: %s", url, e)
            continue
        try:
            games.append(parse_game_html(name, url, game_html))
        except Exception as e:
            _, _, tb = sys.exc_info()
            tb_msg = "\n".join(traceback.format_tb(tb))
            logger.error("Unable to parse game %s.\n%s\n%s", name, e, tb_msg)

    return games


if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))

lottery_data_scraper/schemas.py
@@ -0,0 +1,76 @@
"""Some marshmallow schemas to do data validation and serialization.

How to use:

Create your model as a plain old Python object.

Example:

    game = {}
    game["game_id"] = "5"
    game["price"] = 30
    game["state"] = "tx"

Then create an instance of the schema.

    schema = GameSchema()

Call `schema.dumps(game)` to "dump" your Python object to a string in JSON
format.

    >>> game = {"game_id": "5", "price": 30, "state": "tx", "created_at": datetime.utcnow()}
    >>> schema = GameSchema()
    >>> schema.dumps(game)
    '{"game_id": "5", "state": "tx", "created_at": "2023-04-08T05:58:49.494561", "price": 30.0, "image_urls": "[]"}'

And you can load a JSON string into a Python object with `schema.loads`.

    >>> schema.loads(schema.dumps(game))
    {'game_id': '5', 'state': 'tx', 'created_at': datetime.datetime(2023, 4, 8, 5, 58, 49, 494561), 'price': 30.0, 'image_urls': []}

Some fields, like `game_id`, are required. You can validate a Python object
by calling `schema.validate`.

    >>> game = {"price": 30, "state": "tx", "created_at": datetime.utcnow()}
    >>> schema.dumps(game)
    '{"state": "tx", "created_at": "2023-04-08T06:02:32.126541", "price": 30.0, "image_urls": "[]"}'
    >>> schema.validate(game)
    {'created_at': ['Not a valid datetime.']}
"""
from datetime import datetime
import json

from marshmallow import Schema, fields


class PrizeSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    game_id = fields.Integer()
    available = fields.Integer()
    claimed = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    value = fields.Number()
    prize = fields.Str()


class GameSchema(Schema):
    class Meta:
        render_module = json

    id = fields.Integer()
    created_at = fields.DateTime(load_default=datetime.utcnow)
    game_id = fields.Str(required=True)
    name = fields.Str()
    description = fields.Str()
    image_urls = fields.Function(
        lambda x: json.dumps(x.get("image_urls", [])),
        deserialize=lambda x: json.loads(x),
    )
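    # image_urls round-trips a Python list through a single JSON-encoded
    # string: dump turns the list into a string, load parses it back.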
    how_to_play = fields.Str()
    num_tx_initial = fields.Integer()
    price = fields.Number()
    prizes = fields.Nested(PrizeSchema, many=True)
    state = fields.Str()
    updated_at = fields.DateTime()
    url = fields.Str()

setup.py
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages


setup(
    name="lottery_data_scraper",
    version="0.0.1",
    author="Eric Ihli",
    author_email="eihli@owoga.com",
    url="https://github.com/owogac/lottery_data_scraper",
    packages=find_packages(),
    install_requires=[
        "beautifulsoup4",
        "requests==2.28.2",
        "urllib3==1.26.15",
        "numpy",
        "pandas",
        "lxml",
        "html2text",
        "html5lib",
        "marshmallow==3.19.0",
        "selenium==3.141.0",
        "pybind11",
        # If you want to develop locally and don't want to mess around with
        # Xvfb (https://en.wikipedia.org/wiki/Xvfb), then just comment out
        # the next line before you run `python3 setup.py install`.
        "xvfbwrapper==0.2.9",
        "table_ocr==0.2.5",
    ],
)

tests/test_pennsylvania.py
@@ -0,0 +1,23 @@
import unittest

from lottery_data_scraper import pennsylvania
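
# Note: this test fetches live pages from palottery.state.pa.us. Run with
# USE_CACHE=true so fetch_html caches the HTML between runs.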


class TestPennsylvania(unittest.TestCase):
    def test_parse_game_html(self):
        # URL chosen arbitrarily
        url = "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201"
        html = pennsylvania.fetch_html(url)
        game = pennsylvania.parse_game_html("$3 Million Mega Stacks", url, html)
        self.assertEqual(game["name"], "$3 Million Mega Stacks")
        self.assertEqual(game["price"], 30)
        self.assertEqual(
            game["url"],
            "https://www.palottery.state.pa.us/Scratch-Offs/View-Scratch-Off.aspx?id=3201",
        )
        self.assertEqual(game["game_id"], "3201")
        self.assertEqual(game["prizes"][0]["prize"], "$3,000,000.00")
        # Perhaps unfortunately in dollars. Cents would be better, eh?
        self.assertEqual(game["prizes"][0]["value"], 3000000)