From abf7c1f103742ff0534631179fce2c4b93a38e4e Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:19:24 -0700
Subject: [PATCH 1/6] texas added

---
 lottery_data_scraper/texas.py | 140 ++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 lottery_data_scraper/texas.py

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
new file mode 100644
index 0000000..41fe334
--- /dev/null
+++ b/lottery_data_scraper/texas.py
@@ -0,0 +1,140 @@
+"""
+Scrapes the Louisiana lottery website for scratch-off ticket
+data and calculates the expected value for each game.
+
+Louisiana publishes the number of tickets printed and how many
+tickets are printed at each prize level.
+
+We can calculate the expected value of a game by summing
+the value of all the prizes and dividing that by the cost
+of all the tickets.
+
+The Texas lottery website has a "top prizes remaining" or "index" page that
+has links to every game that could still be profitable.
+Each individual game page has a "game rules" section and a prize table.
+We can use each individual game page to gather the important data, and
+then run our calculations.
+
+Website that we'll be scraping:
+http://www.txlottery.org/export/sites/lottery/Games/Scratch_Offs/all.html
+
+Example usage:
+    python -m texas
+Or:
+    LOGLEVEL=DEBUG USE_CACHE=True python -m texas
+
+The following behavior is configurable through shell environment variables.
+
+Set LOGLEVEL to print useful debug info to console.
+LOGLEVEL=[DEBUG,INFO,WARNING,ERROR,CRITICAL]
+Defaults to WARNING.
+
+Set USE_CACHE to cache responses. This speeds up development
+and is nice to the servers we're hitting.
+USE_CACHE=[True]
+Defaults to False. Note: setting this env variable to the string "False"
+will still enable the cache, because the string "False" is truthy.
+Either set it to True or don't set it.
+""" + +import logging +import os +import re +from xmlrpc import client + +from bs4 import BeautifulSoup as bs +import pandas as pd +import requests +from lottery_data_scraper.schemas import GameSchema +from lottery_data_scraper.util import fetch_html + +logger = logging.getLogger(__name__) + +BASE_URL = "http://www.txlottery.org" +INDEX_URL = ( + "http://www.txlottery.org/export/sites/lottery/Games/Scratch_Offs/all.html" +) + + +def parse_index(html): + soup = bs(html, "lxml") + table = soup.find("table") + game_hrefs = table.select("tr > td > a") + game_urls = list(map(lambda x: BASE_URL + x.attrs["href"], game_hrefs)) + return game_urls + + +def parse_game(url, html): + soup = bs(html, "lxml") + price = int( + re.match( + r"\$(\d+)", + soup.select("h3 > img")[0].attrs["alt"] + ).group(1) + ) + game_details = soup.select(".large-4.cell > h3")[0].parent.text.strip() + title = soup.select(".large-12.cell > .text-center > h2")[0].text.split(" - ") + name = title[1] + num = title[0][-4:] + num_tx = int( + re.match( + r".*?([\d,]+)", + soup.find(string=re.compile(r"There are approximately [\d,]+.*")).strip() + ).group(1).replace(",", "") + ) + # Prizes + table = soup.find("table") + df = pd.read_html(str(table))[0] + df = df.replace("---", 0) + df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False) # noqa: E231 + prizes = [] + for prize, total, claimed in [list(r[1]) for r in df.iterrows()]: + match = re.match(r"\$?([\d,]+).*wk.*", prize) + if match: + value = float(match.group(1).replace(",", "")) * 20 * 52 + prize = match.group(0) + else: + value = float(prize.replace(",", "")) + prize = "$" + prize + prizes.append( + { + "prize": prize, + "value": value, + "claimed": int(claimed), + "available": int(total) - int(claimed), + } + ) + game = { + "name": name, + "game_id": num, + "url": url, + "price": price, + "state": "tx", + "num_tx_initial": num_tx, + "prizes": prizes, + } + return game + + +def _parse_game(url, html): + try: + return parse_game(url, html) + except Exception as e: + logger.warning("Unable to parse {}.\n{}".format(url, e)) + return None + + +def main(): + index_html = fetch_html(INDEX_URL) + game_urls = parse_index(index_html) + url_htmls = zip(game_urls, [fetch_html(url) for url in game_urls]) + games = [_parse_game(url, html) for url, html in url_htmls] + games = [game for game in games if game is not None] + return games + + + +if __name__ == "__main__": + games = main() + schema = GameSchema(many=True) + print(schema.dumps(games)) \ No newline at end of file From 879d08ffaf07aa1d4705178060c8da6d24d45dd9 Mon Sep 17 00:00:00 2001 From: tdhood Date: Thu, 20 Apr 2023 09:21:34 -0700 Subject: [PATCH 2/6] cleaned up --- lottery_data_scraper/texas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py index 41fe334..641f1ad 100644 --- a/lottery_data_scraper/texas.py +++ b/lottery_data_scraper/texas.py @@ -86,7 +86,7 @@ def parse_game(url, html): table = soup.find("table") df = pd.read_html(str(table))[0] df = df.replace("---", 0) - df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False) # noqa: E231 + df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "") # noqa: E231 prizes = [] for prize, total, claimed in [list(r[1]) for r in df.iterrows()]: match = re.match(r"\$?([\d,]+).*wk.*", prize) From ebe3e6e174fc66e8ef46569535fa659ff8312589 Mon Sep 17 00:00:00 2001 From: tdhood Date: Thu, 20 Apr 2023 09:22:14 -0700 Subject: [PATCH 3/6] cleaned up --- lottery_data_scraper/texas.py | 2 +- 1 file 
From 879d08ffaf07aa1d4705178060c8da6d24d45dd9 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:21:34 -0700
Subject: [PATCH 2/6] cleaned up

---
 lottery_data_scraper/texas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 41fe334..641f1ad 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -86,7 +86,7 @@ def parse_game(url, html):
     table = soup.find("table")
     df = pd.read_html(str(table))[0]
     df = df.replace("---", 0)
-    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)  # noqa: E231
+    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")  # noqa: E231
     prizes = []
     for prize, total, claimed in [list(r[1]) for r in df.iterrows()]:
         match = re.match(r"\$?([\d,]+).*wk.*", prize)

From ebe3e6e174fc66e8ef46569535fa659ff8312589 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:22:14 -0700
Subject: [PATCH 3/6] cleaned up

---
 lottery_data_scraper/texas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 641f1ad..6b91ca9 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -86,7 +86,7 @@ def parse_game(url, html):
     table = soup.find("table")
     df = pd.read_html(str(table))[0]
     df = df.replace("---", 0)
-    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")  # noqa: E231
+    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")
     prizes = []
     for prize, total, claimed in [list(r[1]) for r in df.iterrows()]:
         match = re.match(r"\$?([\d,]+).*wk.*", prize)

From 2ef19fe207464317993a259121fd88e19c020255 Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Thu, 20 Apr 2023 22:35:00 -0500
Subject: [PATCH 4/6] Update references of Louisiana to be Texas

---
 lottery_data_scraper/texas.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 6b91ca9..388ca97 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -1,8 +1,8 @@
 """
-Scrapes the Louisiana lottery website for scratch-off ticket
+Scrapes the Texas lottery website for scratch-off ticket
 data and calculates the expected value for each game.
 
-Louisiana publishes the number of tickets printed and how many
+Texas publishes the number of tickets printed and how many
 tickets are printed at each prize level.
 
 We can calculate the expected value of a game by summing
@@ -137,4 +137,4 @@ def main():
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
-    print(schema.dumps(games))
\ No newline at end of file
+    print(schema.dumps(games))

From 765b0039f7c82d16bdd8a62458e431945458a338 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 20:44:59 -0700
Subject: [PATCH 5/6] added new line

---
 lottery_data_scraper/texas.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 6b91ca9..c4f4b6a 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -137,4 +137,5 @@ def main():
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
-    print(schema.dumps(games))
\ No newline at end of file
+    print(schema.dumps(games))
+    
\ No newline at end of file

From 67c1e628607d0dfbcf472da5e830f4155b1d317b Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 20:47:40 -0700
Subject: [PATCH 6/6] added new line

---
 lottery_data_scraper/texas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index c4f4b6a..61b2fa3 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -133,9 +133,7 @@ def main():
     return games
 
 
-
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
     print(schema.dumps(games))
-    
\ No newline at end of file
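
The value = ... * 20 * 52 branch in parse_game treats any prize label containing "wk" as a
weekly payment running for 20 years, regardless of the term printed in the label, and values
it at the total undiscounted payout. A quick standalone check of that branch, using a
hypothetical label (the exact wording on the Texas site may differ):

import re

# Hypothetical prize label; the exact wording on the Texas site may differ.
prize = "1,000/wk for 20 yrs"
match = re.match(r"\$?([\d,]+).*wk.*", prize)
if match:
    weekly = float(match.group(1).replace(",", ""))
    value = weekly * 20 * 52  # 20 years of weekly payments, undiscounted
    print(value)  # 1040000.0

This valuation ignores the time value of money, so annuity-style prizes are weighted by their
headline total rather than a discounted present value.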