# Provenance: contents of lottery_data_scraper/louisiana.py as introduced by
# commit 9215af12a3489a83f8ee944cd8375944845bab79 ("added louisiana.py",
# tdhood, Mon, 10 Apr 2023 18:57:25 -0700), with review fixes applied.
"""Scrape scratch-off game data from the Louisiana Lottery website.

The index page lists every game in a table; each linked game page is fetched
and parsed into a plain ``dict``. Running the module as a script serializes
the result with ``GameSchema`` and prints it.
"""

# NOTE: several of these imports are not referenced below; they are kept
# because they were part of the module as committed.
import locale
import logging
import os
import re
import sys
import traceback
from copy import deepcopy
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html

logger = logging.getLogger(__name__)

# Values used in several places are kept as module-level constants.
BASE_URL = "http://www.louisianalottery.com"
INDEX_URL = "https://louisianalottery.com/scratch-offs/top-prizes-remaining"
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would hang the scraper indefinitely.
REQUEST_TIMEOUT = 30


def _fetch(url):
    """Return the body of *url* as text, raising on HTTP error statuses."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def parse_index(html):
    """Return the absolute URL of every game linked from the index page.

    Args:
        html: Markup of the page at ``INDEX_URL``.

    Returns:
        A list of URL strings, one per ``<a>`` found in a cell of the first
        table on the page.

    Raises:
        AttributeError: If the page contains no ``<table>``.
    """
    soup = bs(html, "lxml")
    table = soup.find("table")
    links = table.select("tr > td > a")
    # urljoin resolves protocol-relative ("//host/path"), absolute and
    # relative hrefs alike; for protocol-relative links the result is the
    # same as prefixing "https:".
    return [urljoin(INDEX_URL, link.attrs["href"]) for link in links]


def parse_game(url, html):
    """Parse one game page into a dict describing the game.

    Args:
        url: Address the page was fetched from. The second-to-last path
            segment is used as the game id.
        html: Markup of the game page.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``url``, ``state``,
        ``price``, ``num_tx_initial`` and ``prizes``. ``prizes`` is a list of
        dicts with the keys ``prize``, ``value``, ``claimed`` and
        ``available``.

    Raises:
        IndexError, AttributeError, ValueError: If the page does not have the
            expected structure or a field cannot be converted to a number.
    """
    soup = bs(html, "lxml")
    price = soup.select('div[id="scratch-off-prize-info"] td')[1].text.replace("$", "")
    name = soup.find(class_="scratch-off-title").text
    game_id = url.split("/")[-2]

    # The first row of the tier table is treated as the grand prize.
    top_tier_cells = soup.select(
        'div[id="scratch-off-table-tier"] table > tbody > tr'
    )[0].select("td")
    # Odds are written as "1 in N"; keep N.
    grand_prize_odds = float(
        top_tier_cells[1].text.split(" in ")[1].replace(",", "")
    )
    grand_prize_num = int(top_tier_cells[2].text.replace(",", ""))
    # Estimate of tickets issued: N (from "1 in N") times the number of
    # grand prizes.
    num_tx = int(grand_prize_odds * grand_prize_num)

    table = soup.find_all("table")[2]
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]
    # "$" is a regex anchor, so request a literal replacement explicitly
    # instead of relying on the version-dependent default.
    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)
    # Prizes listed as "TICKET" are valued at the ticket price.
    df = df.replace("TICKET", price)

    # Columns are unpacked as prize, <unused>, total, claimed.
    # NOTE(review): the unused second column is presumably the odds - confirm.
    prizes = [
        {
            "prize": prize,
            "value": float(prize.replace(",", "")),
            "claimed": int(claimed),
            "available": int(total) - int(claimed),
        }
        for prize, _, total, claimed in df.iloc[:, :4].itertuples(
            index=False, name=None
        )
    ]
    return {
        "name": name,
        "game_id": game_id,
        "url": url,
        "state": "la",
        "price": float(price),
        "num_tx_initial": num_tx,
        "prizes": prizes,
    }


def main():
    """Scrape every game linked from the index page.

    Returns:
        A list of game dicts as produced by ``parse_game``. Games whose page
        cannot be fetched or parsed are logged and skipped.
    """
    index_html = _fetch(INDEX_URL)
    games = []
    for url in parse_index(index_html):
        # Deliberately broad: one bad game page (network error or unexpected
        # markup) must not abort the whole scrape.
        try:
            game = parse_game(url, _fetch(url))
        except Exception as e:
            logger.warning("Unable to parse %s.\n%s", url, e)
            continue
        games.append(game)
    return games


if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))