From 78ae5a45ee3282e50453bf8cff48df9643f9f3e5 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 4 May 2023 11:34:56 -0700
Subject: [PATCH] adding arkansas parser and test

---
 lottery_data_scraper/arkansas.py | 97 ++++++++++++++++++++++++++++++++
 tests/test_arkansas.py | 18 ++++++
 2 files changed, 115 insertions(+)
 create mode 100644 lottery_data_scraper/arkansas.py
 create mode 100644 tests/test_arkansas.py

diff --git a/lottery_data_scraper/arkansas.py b/lottery_data_scraper/arkansas.py
new file mode 100644
index 0000000..0897cf5
--- /dev/null
+++ b/lottery_data_scraper/arkansas.py
@@ -0,0 +1,97 @@
+import logging
+import os
+import re
+from xmlrpc import client
+
+from bs4 import BeautifulSoup as bs
+import pandas as pd
+import requests
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://www.myarkansaslottery.com"
+INDEX_URL = "https://www.myarkansaslottery.com/games/instant?amount=All"
+
+
+def game_urls():
+    index = requests.get(INDEX_URL).text
+    soup = bs(index, "lxml")
+    page_hrefs = soup.find_all("a", title=re.compile("Go to page"))
+    page_links = [BASE_URL + l.attrs["href"] for l in page_hrefs]
+    page_htmls = [index] + [requests.get(page_link).text for page_link in page_links]
+    game_links = []
+    for page_html in page_htmls:
+        page_soup = bs(page_html, "lxml")
+        game_hrefs = page_soup.select(
+            'article[class~="node-instant-game"] '
+            'div[class~="field-name-title-field"] a'
+        )
+        game_links += [BASE_URL + l.attrs["href"] for l in game_hrefs]
+    return game_links
+
+
+def num_tickets(soup):
+    els = soup.select('[data-cell-title="Total Prizes:"]')
+    num_winning_tx = sum(map(lambda x: int(x.text.replace(",", "")), els))
+    odds = float(
+        soup.find(class_="field-name-field-game-odds").text.split(" in ")[1].strip()
+    )
+    return num_winning_tx * odds
+
+
+def parse_game(url, html):
+    logger.debug(f"Parsing {url}")
+    soup = bs(html, "lxml")
+    price = soup.find(class_="field-name-field-ticket-price").text.split("$")[1].strip()
+    name = soup.find("div", class_="field-name-title-field").text.strip()
+    num = soup.find(class_="field-name-field-game-number").text.split("No.")[1].strip()
+    num_tx = int(num_tickets(soup))
+    table = soup.find("table")
+    df = pd.read_html(str(table))[0]
+    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)
+    prizes = [
+        {
+            "prize": prize,
+            "value": float(prize.replace(",", "")),
+            "claimed": int(claimed),
+            "available": int(total) - int(claimed),
+        }
+        for prize, total, claimed in [
+            [r[1][0], r[1][1], r[1][1] - r[1][2]] for r in df.iterrows()
+        ]
+    ]
+
+    game = {
+        "name": name,
+        "game_id": num,
+        "url": url,
+        "state": "ar",
+        "price": float(price),
+        "num_tx_initial": num_tx,
+        "prizes": prizes,
+    }
+    return game
+
+
+def main():
+    urls = game_urls()
+    url_htmls = zip(urls, [fetch_html(url) for url in urls])
+    games = []
+    for url, html in url_htmls:
+        try:
+            game = parse_game(url, html)
+        except Exception as e:
+            logger.error("Unable to parse {}.\n>{}".format(url, e))
+            continue
+        games.append(game)
+    return games
+
+
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
\ No newline at end of file
diff --git a/tests/test_arkansas.py b/tests/test_arkansas.py
new file mode 100644
index 0000000..61bc6fa
--- /dev/null
+++ b/tests/test_arkansas.py
@@ -0,0 +1,18 @@
+import unittest
+import requests
+
+from lottery_data_scraper import arkansas
+from lottery_data_scraper import schemas
+
+class TestArkansas(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily -- first game returned in list
+        url = 'https://www.myarkansaslottery.com/games/200000-jackpot-1'
+        html = arkansas.fetch_html(url)
+        game = arkansas.parse_game(url, html)
+        self.assertEqual(game['name'], '$200,000 Jackpot')
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "732")
+        self.assertEqual(game["num_tx_initial"], 1132310)
+        self.assertEqual(game["prizes"][0]["prize"], "200,000")
+        self.assertEqual(game["prizes"][0]["value"], 200000)
\ No newline at end of file
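
Note (not part of the patch): a minimal sketch of how the new parser can be smoke-tested by hand, assuming the package is importable from the repo root, the Arkansas site is reachable, and GameSchema behaves like the marshmallow-style schema the module's __main__ block implies. The URL is the one used in the unit test above.

    from lottery_data_scraper import arkansas
    from lottery_data_scraper.schemas import GameSchema

    # Same game page the unit test uses; requires live network access.
    url = "https://www.myarkansaslottery.com/games/200000-jackpot-1"
    html = arkansas.fetch_html(url)  # fetch_html is available on the module via its import from util
    game = arkansas.parse_game(url, html)
    print(GameSchema().dumps(game))  # assumption: dumps() serializes a single game dict

The full scraper runs the same way the patch's __main__ block does, e.g. python -m lottery_data_scraper.arkansas, and the test via the standard unittest runner (python -m unittest), both of which hit the live site.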