From 37c40360c2dd8f47633cdea0331956caf9e57b15 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 14:50:54 -0700
Subject: [PATCH 1/4] adding connecticut parser and test

---
 lottery_data_scraper/connecticut.py | 127 ++++++++++++++++++++++++++++
 tests/test_connecticut.py           |  17 ++++
 2 files changed, 144 insertions(+)
 create mode 100644 lottery_data_scraper/connecticut.py
 create mode 100644 tests/test_connecticut.py

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
new file mode 100644
index 0000000..cd4ba19
--- /dev/null
+++ b/lottery_data_scraper/connecticut.py
@@ -0,0 +1,127 @@
+import logging
+import os
+import re
+import sys
+import traceback
+from xmlrpc import client
+
+from bs4 import BeautifulSoup as bs
+import html2text
+import requests
+from lottery_data_scraper.schemas import GameSchema 
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+h = html2text.HTML2Text()
+h.ignore_links = True
+
+BASE = "https://www.ctlottery.org"
+
+INDEX = "https://ctlottery.org/ScratchGamesTable"
+
+
+headers = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
+    "Referer": "https://www.ctlottery.org/ScratchGames",
+}
+
+
+
+def get_games_urls(url):
+    html = fetch_html(url)
+    soup = bs(html, "lxml")
+    table = soup.find("table")
+    game_hrefs = table.select("tr > td > a")
+    game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs))
+    return game_urls
+
+def parse_game(game_url):
+    # Each game page has two tables
+    # Table 1: Ticket Price, Num_Tx_remaining, Odds
+    # Table 2: Prize Table
+
+    game_html = fetch_html(game_url)
+    game_soup = bs(game_html, "lxml")
+
+    name = game_soup.find("h2").text
+    game_id = re.match(r"GAME #(\d*)",game_soup.find(class_="heading-sub-info").text).group(1)
+
+    #soup for table 1
+    table_one = game_soup.find(class_="img-detail-block")
+
+    price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
+
+    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(1)
+    num_tx_initial = int(num_tx_str.replace(",", ""))
+
+
+    #soup for table 2
+    table_two = game_soup.find(class_="unclaimed-prize-wrap")
+    prize_rows = (
+        table_two.find("tbody").find_all("tr")
+    )
+    prizes = []
+    for row in prize_rows:
+        prize, total, available = [r.text for r in row.find_all("td")]
+        total = int(total.replace(",", ""))
+        available = int(available.replace(",", ""))
+        # one-off handlers...
+        if re.search(r"(?i)month.*for.*life", prize):
+            value = re.search(r"[\d,]+", prize).group()
+            value = float(value.replace(",", "")) * 20 * 12
+        elif re.search(r"(?i)\$\d+ million", prize):
+            value = float(re.search(r"\d+", prize).group()) * 1000000
+        else:
+            value = re.search(r"[\d,]+", prize).group()
+            value = float(value.replace("$", "").replace(",", ""))
+        prizes.append(
+            {
+                "prize": prize,
+                "value": value,
+                "claimed": total - available,
+                "available": available,
+            }
+        )
+
+    how_to_play_soup = game_soup.find(class_="play-text-wrap")
+    #remove heading and button tags
+    how_to_play_soup.h3.extract()
+    how_to_play_soup.a.extract()
+
+    how_to_play = h.handle(how_to_play_soup.text)
+
+    image_urls = BASE + game_soup.find(id="ticket_image").attrs["src"]
+
+    game = {
+        "state": "ct",
+        "game_id": game_id,
+        "name": name,
+        "price": price,
+        # Individual games are JavaScript links
+        "url": game_url,
+        "prizes": prizes,
+        "num_tx_initial": num_tx_initial,
+        "how_to_play": how_to_play,
+        "image_urls": image_urls
+    }
+    return game
+
+def main():
+    games_urls = get_games_urls(INDEX)
+    games = []
+    for game in games_urls:
+        try:
+            game = parse_game(game)
+        except Exception as e:
+            logger.error("Unable to parse game {}.\n{}".format(game, e))
+        games.append(game)
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
+
diff --git a/tests/test_connecticut.py b/tests/test_connecticut.py
new file mode 100644
index 0000000..f96af33
--- /dev/null
+++ b/tests/test_connecticut.py
@@ -0,0 +1,17 @@
+import unittest
+import requests
+
+from lottery_data_scraper import connecticut
+from lottery_data_scraper import schemas
+
+class TestConnecticut(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = 'https://www.ctlottery.org/ScratchGames/1740/'
+        game = connecticut.parse_game(url)
+        self.assertEqual(game['name'], 'Extreme Green')
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "1740")
+        self.assertEqual(game["prizes"][0]["prize"], "$100,000")
+        self.assertEqual(game["prizes"][0]["value"], 100000)
+        self.assertEqual(game["num_tx_initial"], 2230800)
\ No newline at end of file

From bb1014f5416599f12ce68e0c3e7b31d59fc043c1 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 15:55:45 -0700
Subject: [PATCH 2/4] fixed issue

---
 lottery_data_scraper/connecticut.py | 32 +++++++++++++++--------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index cd4ba19..35ad1a1 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -8,7 +8,7 @@ from xmlrpc import client
 from bs4 import BeautifulSoup as bs
 import html2text
 import requests
-from lottery_data_scraper.schemas import GameSchema 
+from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 
 logger = logging.getLogger(__name__)
@@ -28,7 +28,6 @@ headers = {
 }
 
 
-
 def get_games_urls(url):
     html = fetch_html(url)
     soup = bs(html, "lxml")
@@ -37,6 +36,7 @@ def get_games_urls(url):
     game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs))
     return game_urls
 
+
 def parse_game(game_url):
     # Each game page has two tables
     # Table 1: Ticket Price, Num_Tx_remaining, Odds
@@ -46,22 +46,23 @@ def parse_game(game_url):
     game_soup = bs(game_html, "lxml")
 
     name = game_soup.find("h2").text
-    game_id = re.match(r"GAME #(\d*)",game_soup.find(class_="heading-sub-info").text).group(1)
+    game_id = re.match(
+        r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
+    ).group(1)
 
-    #soup for table 1
+    # soup for table 1
     table_one = game_soup.find(class_="img-detail-block")
 
     price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
-
-    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(1)
-    num_tx_initial = int(num_tx_str.replace(",", ""))
+    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(
+        1
+    )
+    num_tx_initial = int(num_tx_str.replace(",", ""))
 
 
-    #soup for table 2
+    # soup for table 2
     table_two = game_soup.find(class_="unclaimed-prize-wrap")
-    prize_rows = (
-        table_two.find("tbody").find_all("tr")
-    )
+    prize_rows = table_two.find("tbody").find_all("tr")
     prizes = []
     for row in prize_rows:
         prize, total, available = [r.text for r in row.find_all("td")]
@@ -86,7 +87,7 @@ def parse_game(game_url):
         )
 
     how_to_play_soup = game_soup.find(class_="play-text-wrap")
-    #remove heading and button tags
+    # remove heading and button tags
     how_to_play_soup.h3.extract()
     how_to_play_soup.a.extract()
 
@@ -104,10 +105,11 @@ def parse_game(game_url):
         "prizes": prizes,
         "num_tx_initial": num_tx_initial,
         "how_to_play": how_to_play,
-        "image_urls": image_urls
+        "image_urls": image_urls,
     }
     return game
 
+
 def main():
     games_urls = get_games_urls(INDEX)
     games = []
@@ -116,7 +118,8 @@ def main():
             game = parse_game(game)
         except Exception as e:
             logger.error("Unable to parse game {}.\n{}".format(game, e))
-        games.append(game)
+            continue
+        games.append(game)
     return games
 
 
@@ -124,4 +127,3 @@ if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
     print(schema.dumps(games))
-

From 57b83d096e082c3de3ae2f3c48e0af4421f4b425 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 16:18:14 -0700
Subject: [PATCH 3/4] removed unused imports

---
 lottery_data_scraper/connecticut.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index 35ad1a1..a049aaf 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -1,13 +1,8 @@
 import logging
-import os
 import re
-import sys
-import traceback
-from xmlrpc import client
 
 from bs4 import BeautifulSoup as bs
 import html2text
-import requests
 from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 

From dd616da48aa0f740d28f21abab463d7507e14a44 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Tue, 9 May 2023 13:16:23 -0700
Subject: [PATCH 4/4] returning image_urls as a list

---
 lottery_data_scraper/connecticut.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index a049aaf..15c45d1 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -100,7 +100,7 @@ def parse_game(game_url):
         "prizes": prizes,
         "num_tx_initial": num_tx_initial,
         "how_to_play": how_to_play,
-        "image_urls": image_urls,
+        "image_urls": [image_urls],
     }
     return game
 
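A minimal usage sketch for reviewers who want to try the new parser with the patches above applied. It assumes the project's dependencies (bs4, html2text, the schemas and util modules) are installed and that www.ctlottery.org is reachable, since parse_game fetches live pages; the game-1740 URL is the same arbitrarily chosen game used by tests/test_connecticut.py.

    from lottery_data_scraper import connecticut
    from lottery_data_scraper.schemas import GameSchema

    # Parse one scratch-game page (game #1740, "Extreme Green" in the test).
    game = connecticut.parse_game("https://www.ctlottery.org/ScratchGames/1740/")
    print(game["name"], game["price"], game["num_tx_initial"])

    # Or scrape every game on the index and serialize the result,
    # mirroring what the module's __main__ block does.
    print(GameSchema(many=True).dumps(connecticut.main()))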