From 37c40360c2dd8f47633cdea0331956caf9e57b15 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 14:50:54 -0700
Subject: [PATCH] adding connecticut parser and test

---
 lottery_data_scraper/connecticut.py | 139 ++++++++++++++++++++++++++++
 tests/test_connecticut.py           |  24 +++++
 2 files changed, 163 insertions(+)
 create mode 100644 lottery_data_scraper/connecticut.py
 create mode 100644 tests/test_connecticut.py

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
new file mode 100644
index 0000000..cd4ba19
--- /dev/null
+++ b/lottery_data_scraper/connecticut.py
@@ -0,0 +1,139 @@
+"""Scrape scratch game data from the Connecticut Lottery website."""
+
+import logging
+import os
+import re
+import sys
+import traceback
+from xmlrpc import client
+
+from bs4 import BeautifulSoup as bs
+import html2text
+import requests
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+h = html2text.HTML2Text()
+h.ignore_links = True
+
+BASE = "https://www.ctlottery.org"
+
+INDEX = "https://ctlottery.org/ScratchGamesTable"
+
+
+headers = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
+    "Referer": "https://www.ctlottery.org/ScratchGames",
+}
+
+
+def get_games_urls(url):
+    """Return the absolute URL of every game linked from the index table.
+
+    The hrefs of the ``tr > td > a`` links in the first table of the page
+    at ``url`` are appended to ``BASE``.
+    """
+    html = fetch_html(url)
+    soup = bs(html, "lxml")
+    table = soup.find("table")
+    return [BASE + link.attrs["href"] for link in table.select("tr > td > a")]
+
+
+def _prize_value(prize):
+    """Convert the text of a prize cell into a dollar amount (float)."""
+    # one-off handlers...
+    if re.search(r"(?i)month.*for.*life", prize):
+        # Monthly-for-life prizes are valued at 20 * 12 monthly payments.
+        monthly = re.search(r"[\d,]+", prize).group()
+        return float(monthly.replace(",", "")) * 20 * 12
+    # The "$" has to be escaped: a bare "$" is an end-of-string anchor and
+    # would keep this branch from ever matching.
+    millions = re.search(r"(?i)\$(\d+(?:\.\d+)?)\s*million", prize)
+    if millions:
+        return float(millions.group(1)) * 1000000
+    amount = re.search(r"[\d,]+", prize).group()
+    return float(amount.replace(",", ""))
+
+
+def parse_game(game_url):
+    """Fetch a game page and return the game's data as a dict.
+
+    Each game page has two tables:
+      Table 1: ticket price, ticket count, odds
+      Table 2: prize table
+    """
+    game_html = fetch_html(game_url)
+    game_soup = bs(game_html, "lxml")
+
+    name = game_soup.find("h2").text
+    sub_info = game_soup.find(class_="heading-sub-info").text
+    game_id = re.search(r"GAME #(\d+)", sub_info).group(1)
+
+    # soup for table 1
+    table_one = game_soup.find(class_="img-detail-block")
+
+    price = int(re.search(r"Ticket Price:\$(\d+)", table_one.text).group(1))
+
+    num_tx_str = re.search(r"Total # of Tickets:([\d,]+)", table_one.text).group(1)
+    num_tx_initial = int(num_tx_str.replace(",", ""))
+
+    # soup for table 2
+    table_two = game_soup.find(class_="unclaimed-prize-wrap")
+    prizes = []
+    for row in table_two.find("tbody").find_all("tr"):
+        prize, total, available = [cell.text for cell in row.find_all("td")]
+        total = int(total.replace(",", ""))
+        available = int(available.replace(",", ""))
+        prizes.append(
+            {
+                "prize": prize,
+                "value": _prize_value(prize),
+                "claimed": total - available,
+                "available": available,
+            }
+        )
+
+    how_to_play_soup = game_soup.find(class_="play-text-wrap")
+    # remove heading and button tags
+    how_to_play_soup.h3.extract()
+    how_to_play_soup.a.extract()
+
+    how_to_play = h.handle(how_to_play_soup.text)
+
+    image_urls = BASE + game_soup.find(id="ticket_image").attrs["src"]
+
+    return {
+        "state": "ct",
+        "game_id": game_id,
+        "name": name,
+        "price": price,
+        "url": game_url,
+        "prizes": prizes,
+        "num_tx_initial": num_tx_initial,
+        "how_to_play": how_to_play,
+        "image_urls": image_urls,
+    }
+
+
+def main():
+    """Parse every game listed on the index page and return the results.
+
+    A game that fails to parse is logged and skipped, so a URL string is
+    never mixed into the list of parsed game dicts.
+    """
+    games = []
+    for game_url in get_games_urls(INDEX):
+        try:
+            games.append(parse_game(game_url))
+        except Exception as e:
+            logger.error("Unable to parse game {}.\n{}".format(game_url, e))
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
diff --git a/tests/test_connecticut.py b/tests/test_connecticut.py
new file mode 100644
index 0000000..f96af33
--- /dev/null
+++ b/tests/test_connecticut.py
@@ -0,0 +1,24 @@
+import unittest
+import requests
+
+from lottery_data_scraper import connecticut
+from lottery_data_scraper import schemas
+
+
+class TestConnecticut(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = 'https://www.ctlottery.org/ScratchGames/1740/'
+        game = connecticut.parse_game(url)
+        self.assertEqual(game['name'], 'Extreme Green')
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "1740")
+        self.assertEqual(game["prizes"][0]["prize"], "$100,000")
+        self.assertEqual(game["prizes"][0]["value"], 100000)
+        self.assertEqual(game["num_tx_initial"], 2230800)
+
+    def test_prize_value(self):
+        # Pure text-to-amount conversion; needs no network access.
+        self.assertEqual(connecticut._prize_value("$100,000"), 100000)
+        self.assertEqual(connecticut._prize_value("$2 MILLION"), 2000000)
+        self.assertEqual(connecticut._prize_value("$5,000 a month for life"), 1200000)