Adding Arkansas parser and test

3 years ago · 78ae5a45ee
parent 904a5d2462
commit 78ae5a45ee
2 changed files with 115 additions and 0 deletions
--- a/lottery_data_scraper/arkansas.py
+++ b/lottery_data_scraper/arkansas.py
@ -0,0 +1,97 @@
+import logging
+import os
+import re
+from xmlrpc import client
+
+from bs4 import BeautifulSoup as bs
+import pandas as pd
+import requests
+from lottery_data_scraper.schemas import GameSchema 
+from lottery_data_scraper.util import fetch_html
+
+# Logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Site root; hrefs scraped from listing pages are appended to it.
+BASE_URL = "https://www.myarkansaslottery.com"
+# Instant-games listing page, built from the site root.
+INDEX_URL = BASE_URL + "/games/instant?amount=All"
+
+
+def game_urls():
+    """Collect the detail-page URL of every game on the instant-games listing.
+
+    Downloads ``INDEX_URL``, follows each pager anchor whose ``title``
+    matches "Go to page", and gathers the title links of every
+    ``node-instant-game`` article found on those pages.
+
+    Returns:
+        list[str]: ``BASE_URL`` + scraped ``href`` for each game, in the
+        order first seen, without repeats.
+
+    Raises:
+        requests.RequestException: If a listing page cannot be downloaded,
+            times out, or answers with an HTTP error status.
+    """
+
+    def _get(url):
+        # Without a timeout, requests waits indefinitely on a stalled server.
+        response = requests.get(url, timeout=30)
+        # An error page would otherwise be parsed quietly into zero games.
+        response.raise_for_status()
+        return response.text
+
+    index = _get(INDEX_URL)
+    soup = bs(index, "lxml")
+    pager_anchors = soup.find_all("a", title=re.compile("Go to page"))
+    # dict.fromkeys drops repeated pager links while keeping first-seen order.
+    page_links = list(
+        dict.fromkeys(BASE_URL + anchor.attrs["href"] for anchor in pager_anchors)
+    )
+    page_htmls = [index] + [_get(page_link) for page_link in page_links]
+    game_links = []
+    for page_html in page_htmls:
+        page_soup = bs(page_html, "lxml")
+        game_anchors = page_soup.select(
+            'article[class~="node-instant-game"] '
+            'div[class~="field-name-title-field"] a'
+        )
+        game_links += [BASE_URL + anchor.attrs["href"] for anchor in game_anchors]
+    # The same game reached through more than one page is reported once.
+    return list(dict.fromkeys(game_links))
+
+
+def num_tickets(soup):
+    """Multiply the page's summed prize counts by its odds figure.
+
+    Adds up the text of every element selected by
+    ``[data-cell-title="Total Prizes:"]`` (commas removed, parsed as int) and
+    multiplies that sum by the number following " in " in the
+    ``field-name-field-game-odds`` element.
+
+    Args:
+        soup: Parsed game page offering ``select`` and ``find``.
+
+    Returns:
+        float: Sum of the prize counts times the odds figure.
+    """
+    total_prizes = 0
+    for cell in soup.select('[data-cell-title="Total Prizes:"]'):
+        total_prizes += int(cell.text.replace(",", ""))
+    odds_text = soup.find(class_="field-name-field-game-odds").text
+    # Text is split on " in "; the part after it is the numeric odds.
+    one_in = float(odds_text.split(" in ")[1].strip())
+    return total_prizes * one_in
+
+
+def parse_game(url, html):
+    """Parse one game's detail page into a game record.
+
+    Args:
+        url: Address the page came from; logged and stored in the result.
+        html: HTML source of the game page.
+
+    Returns:
+        dict: ``name``, ``game_id``, ``url``, ``state`` ("ar"), ``price``
+        (float), ``num_tx_initial`` (int) and ``prizes`` -- one dict per row
+        of the page's first table holding ``prize``, ``value``, ``claimed``
+        and ``available``.
+
+    Raises:
+        Exception: Whatever the underlying lookups raise when an expected
+            element is missing or a value is not numeric (for example
+            ``AttributeError``, ``IndexError`` or ``ValueError``).
+    """
+    # Function-scope import so the block needs no change to the file header.
+    from io import StringIO
+
+    logger.debug("Parsing %s", url)
+    soup = bs(html, "lxml")
+    price = soup.find(class_="field-name-field-ticket-price").text.split("$")[1].strip()
+    name = soup.find("div", class_="field-name-title-field").text.strip()
+    num = soup.find(class_="field-name-field-game-number").text.split("No.")[1].strip()
+    num_tx = int(num_tickets(soup))
+    table = soup.find("table")
+    # Recent pandas deprecates literal HTML strings here; pass a text buffer.
+    df = pd.read_html(StringIO(str(table)))[0]
+    # regex=False: "$" must be stripped literally, never read as the
+    # end-of-string anchor (the default for ``regex`` differs across pandas
+    # versions).
+    prize_labels = df.iloc[:, 0].str.replace("$", "", regex=False)
+    prizes = []
+    # Columns are taken by position. NOTE(review): the third column is
+    # presumably the count of prizes remaining -- confirm against the page.
+    for prize, total, remaining in zip(prize_labels, df.iloc[:, 1], df.iloc[:, 2]):
+        claimed = total - remaining
+        prizes.append(
+            {
+                "prize": prize,
+                "value": float(prize.replace(",", "")),
+                "claimed": int(claimed),
+                "available": int(total) - int(claimed),
+            }
+        )
+
+    game = {
+        "name": name,
+        "game_id": num,
+        "url": url,
+        "state": "ar",
+        "price": float(price),
+        "num_tx_initial": num_tx,
+        "prizes": prizes,
+    }
+    return game
+
+
+def main():
+    """Fetch and parse every game returned by ``game_urls``.
+
+    A game whose page cannot be fetched or parsed is logged and skipped, so
+    one bad page does not discard the rest of the run.
+
+    Returns:
+        list[dict]: One record per game that parsed successfully, in the
+        order ``game_urls`` returned the URLs.
+    """
+    games = []
+    for url in game_urls():
+        try:
+            # The download sits inside the try as well: a single failed fetch
+            # should skip that game rather than abort the whole scrape.
+            html = fetch_html(url)
+            game = parse_game(url, html)
+        except Exception as e:
+            logger.error("Unable to parse %s.\n>%s", url, e)
+            continue
+        games.append(game)
+    return games
+
+
+
+
+if __name__ == "__main__":
+    # Script entry point: scrape all games, then print them serialized
+    # through a many=True GameSchema.
+    schema = GameSchema(many=True)
+    games = main()
+    print(schema.dumps(games))
--- a/tests/test_arkansas.py
+++ b/tests/test_arkansas.py
@ -0,0 +1,18 @@
+import unittest
+import requests
+
+from lottery_data_scraper import arkansas
+from lottery_data_scraper import schemas
+
+class TestArkansas(unittest.TestCase):
+    """Checks ``arkansas.parse_game`` against a single game page."""
+
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily -- first game returned in list
+        url = "https://www.myarkansaslottery.com/games/200000-jackpot-1"
+        # The page is obtained through the fetch_html name exposed by the
+        # arkansas module, then handed straight to the parser.
+        game = arkansas.parse_game(url, arkansas.fetch_html(url))
+
+        # Top-level fields of the parsed record.
+        self.assertEqual(game["name"], "$200,000 Jackpot")
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "732")
+        self.assertEqual(game["num_tx_initial"], 1132310)
+
+        # First row of the prize table.
+        top_prize = game["prizes"][0]
+        self.assertEqual(top_prize["prize"], "200,000")
+        self.assertEqual(top_prize["value"], 200000)