From 445fe611829c61929435b4bb88f516898505acaa Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sun, 7 May 2023 16:04:52 -0700
Subject: [PATCH 1/2] added parser and tests for idaho

---
Review edits (the index hashes in this series are from the original
submission and no longer match the edited blobs):
  * main() skips a URL that fails to parse instead of appending an
    unbound or stale `game`.
  * The "$5x the cash" override keeps `remaining` an int, like the
    other branches.
  * Added docstrings; hunk counts and the 2/2 hunk offset were updated.

 lottery_data_scraper/idaho.py | 128 ++++++++++++++++++++++++++++++++++
 tests/test_idaho.py           |  17 +++++
 2 files changed, 145 insertions(+)
 create mode 100644 lottery_data_scraper/idaho.py
 create mode 100644 tests/test_idaho.py

diff --git a/lottery_data_scraper/idaho.py b/lottery_data_scraper/idaho.py
new file mode 100644
index 0000000..9c74ec5
--- /dev/null
+++ b/lottery_data_scraper/idaho.py
@@ -0,0 +1,128 @@
+import logging
+import re
+
+from bs4 import BeautifulSoup as bs
+import html2text
+
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+h = html2text.HTML2Text()
+h.ignore_links = True
+
+BASE = "https://www.idaholottery.com"
+INDEX = "https://www.idaholottery.com/games/scratch"
+
+def get_games(url):
+    """Return BASE + href for each `.game__inner a.image-link` anchor on the page at `url`."""
+    html = fetch_html(url)
+    soup = bs(html, "lxml")
+    game_urls = [BASE + n.attrs["href"] for n in soup.select(".game__inner a.image-link")]
+
+    return game_urls
+
+def parse_game(url):
+    """Fetch the game page at `url` and return its details as a dict.
+
+    `num_tx_initial` is the first prize row's odds multiplied by its total.
+    Lookup and conversion errors are not caught here; they reach the caller.
+    """
+    game_html = fetch_html(url)
+    game_soup = bs(game_html, "lxml")
+
+    name = game_soup.select(".section-game h5")[0].text
+
+    image_url = game_soup.select(".section__image-holder img")[0].attrs["src"]
+
+    game_id = image_url.split("/")[-1].split("_")[0]
+
+    how_to_play = h.handle(str(game_soup.find(id="tab2")))
+
+    price_str = game_soup.select(".list-badgets h4")[1].text
+    price = float(price_str.replace("$", ""))
+
+    table = game_soup.find(class_="full-rules-and-odds")
+    rows_soup = table.tbody.find_all("tr")
+    grand_prize_soup = rows_soup[0]
+    total, prize, remaining, odds, _ = map(
+        lambda x: x.text.strip(), grand_prize_soup.find_all("td")
+    )
+
+    odds = int(odds.replace("1:", ""))
+
+    num_tx_initial = odds * int(total)
+
+    most_recent_percent_remaining = 1
+
+    prizes = []
+    for total, prize, remaining, odds, _ in [
+        map(lambda x: x.text.strip(), row.find_all("td")) for row in rows_soup
+    ]:
+        # Their data is dirty. Here are some hacks to try and fix it.
+        # Sometimes, the total is missing.
+        # Try to guess it.
+        try:
+            total = int(total)
+        except ValueError:
+            total = int(int(remaining) / most_recent_percent_remaining)
+
+        value = float(prize.replace("$", "").replace(",", ""))
+
+        try:
+            remaining = int(remaining)
+            # Sometimes, the total is less than the remaining.
+            if total < remaining:
+                total = int(remaining / most_recent_percent_remaining)
+            most_recent_percent_remaining = remaining / total
+        except ValueError:
+            remaining = int(total * most_recent_percent_remaining)
+
+        # There is a typo in the $1 prize of $5x the cash.
+        if prize == "$1" and re.search(r"(?i)5x the cash", name):
+            total = 276000 # num tx / odds
+            remaining = int(total * most_recent_percent_remaining)
+
+        prizes.append(
+            {
+                "prize": prize,
+                "available": remaining,
+                "claimed": total - remaining,
+                "value": value,
+            }
+        )
+
+    game = {
+        "name": name,
+        "url": url,
+        "image_urls": f"[{image_url}]",
+        "state": "id",
+        "game_id": game_id,
+        "how_to_play": how_to_play,
+        "price": price,
+        "num_tx_initial": num_tx_initial,
+        "prizes": prizes
+    }
+
+    return game
+
+def main():
+    """Parse every game URL found on INDEX; log and skip any that fail to parse."""
+    game_urls = get_games(INDEX)
+    games = []
+    for url in game_urls:
+        try:
+            game = parse_game(url)
+        except Exception as e:
+            logger.error("Unable to parse {}.\n{}".format(url, e))
+            # Skip this URL: `game` is unbound or left over from the previous URL.
+            continue
+        games.append(game)
+    return games
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
+
diff --git a/tests/test_idaho.py b/tests/test_idaho.py
new file mode 100644
index 0000000..f0ae58e
--- /dev/null
+++ b/tests/test_idaho.py
@@ -0,0 +1,17 @@
+import unittest
+import requests
+
+from lottery_data_scraper import idaho
+from lottery_data_scraper import schemas
+
+class TestIdaho(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = "https://www.idaholottery.com/games/scratch/lucky-rooster-bingo"
+        game = idaho.parse_game(url)
+        self.assertEqual(game['name'], "Lucky Rooster Bingo")
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "1716")
+        self.assertEqual(game["prizes"][0]["prize"], "$100,000")
+        self.assertEqual(game["prizes"][0]["value"], 100000)
+        self.assertEqual(game["num_tx_initial"], 339900)

From 0ac040217153bfa890eddfe9098473ddad9d314d Mon Sep 17 00:00:00 2001
From: tdhood
Date: Tue, 9 May 2023 13:18:28 -0700
Subject: [PATCH 2/2] returning image_urls as a list

---
 lottery_data_scraper/idaho.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/idaho.py b/lottery_data_scraper/idaho.py
index 9c74ec5..617c176 100644
--- a/lottery_data_scraper/idaho.py
+++ b/lottery_data_scraper/idaho.py
@@ -96,7 +96,7 @@ def parse_game(url):
     game = {
         "name": name,
         "url": url,
-        "image_urls": f"[{image_url}]",
+        "image_urls": [image_url],
         "state": "id",
         "game_id": game_id,
         "how_to_play": how_to_play,