adding florida parser and test

3 years ago · e3c0b0fbdc
parent 904a5d2462
commit e3c0b0fbdc
2 changed files with 111 additions and 0 deletions
--- a/lottery_data_scraper/florida.py
+++ b/lottery_data_scraper/florida.py
@ -0,0 +1,94 @@
+import logging
+from functools import partial
+import os
+import re
+
+from bs4 import BeautifulSoup as bs
+import html2text
+import requests
+
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+BASE = "https://flalottery.com/"
+INDEX = "https://flalottery.com/remainingPrizes"
+h = html2text.HTML2Text()
+
+
+def parse_game(url):
+    html = fetch_html(url)
+    soup = bs(html, 'lxml')
+
+    title = soup.select("#scratch-offs > h1")[0].text
+    uid, name = title[1:].split(" – ")
+
+    details_content = soup.find("div", "ticketDetailsContent")
+
+    how_to_play = h.handle(str(details_content.find_all("p")[1]))
+
+    price_paragraph = details_content.find(
+        string=re.compile(r"Ticket Price:")
+    ).parent.parent
+    price = float(re.search(r"\$(\d+\.\d+)", price_paragraph.text).group(1))
+    table = soup.find("table", "scratchOdds").find("tbody")
+    prize_rows = table.select("tr")
+
+    # Some FL tickets are $X/Year for life.
+    # "Life" in Florida is 20 years.
+    def get_value(prize):
+        if re.search(r"(Year|Yr)", prize, re.IGNORECASE):
+            return float(re.sub(r'[^\d\.]', '', prize)) * 20
+        elif re.search(r"(Week|Wk)", prize, re.IGNORECASE):
+            return float(re.sub(r'[^\d\.]', '', prize)) * 52 * 20
+        else:
+            return float(re.sub(r'[^\d\.]', '', prize))
+
+    prizes = [
+        {
+            "prize": row[0].text,
+            "value": get_value(row[0].text),
+            "available": int(row[3].text.replace(",", "")),
+            "claimed": int(row[2].text.replace(",", "")) - int(row[3].text.replace(",", "")),
+        }
+        for row in [row.find_all("td") for row in prize_rows]
+    ]
+    top_prize_odds = float(
+        prize_rows[0].find_all("td")[1].text.split("-in-")[1].replace(",", "")
+    )
+    num_tx_initial = (prizes[0]["available"] + prizes[0]["claimed"]) * top_prize_odds
+    image_url = soup.find("img", "ticketPicture").attrs["src"]
+    
+    game = {
+        "name": name,
+        "game_id": uid,
+        "how_to_play": how_to_play,
+        "price": price,
+        "state": "fl",
+        "num_tx_initial": num_tx_initial,
+        "image_urls": [image_url],
+        "url": url,
+        "prizes": prizes,
+    }
+    return game
+
+
+def main():
+    index = fetch_html(INDEX)
+    soup = bs(index, "lxml")
+    game_urls = [BASE + t["href"] for t in soup.select(".gameNameLink > a")]
+    games = []
+
+    for url in game_urls:
+        try:
+            game = parse_game(url)
+        except Exception as e:
+            logger.error("Unable to process {}.\n{}".format(url, e))
+        games.append(game)
+    return games
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
--- a/tests/test_florida.py
+++ b/tests/test_florida.py
@ -0,0 +1,17 @@
+import unittest
+import requests
+
+from lottery_data_scraper import florida
+from lottery_data_scraper import schemas
+
+class TestFlorida(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = "https://flalottery.com/scratch-offsGameDetails?gameNumber=7025"
+        game = florida.parse_game(url)
+        self.assertEqual(game['name'], "MYSTERY MULTIPLIER")
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["num_tx_initial"], 20513700)
+        self.assertEqual(game["game_id"], "7025")
+        self.assertEqual(game["prizes"][0]["prize"], "$1,000,000.00")
+        self.assertEqual(game["prizes"][0]["value"], 1000000)