diff --git a/lottery_data_scraper/florida.py b/lottery_data_scraper/florida.py
new file mode 100644
index 0000000..8030850
--- /dev/null
+++ b/lottery_data_scraper/florida.py
@@ -0,0 +1,131 @@
+import logging
+from functools import partial
+import os
+import re
+
+from bs4 import BeautifulSoup as bs
+import html2text
+import requests
+
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+BASE = "https://flalottery.com/"
+INDEX = "https://flalottery.com/remainingPrizes"
+h = html2text.HTML2Text()
+
+
+def parse_game(url):
+    """Fetch one Florida scratch-off game page and parse it into a dict.
+
+    Args:
+        url: URL of the game's details page.
+
+    Returns:
+        A dict with the keys ``name``, ``game_id``, ``how_to_play``,
+        ``price``, ``state``, ``num_tx_initial``, ``image_urls``, ``url``
+        and ``prizes``. Each entry of ``prizes`` has the keys ``prize``,
+        ``value``, ``available`` and ``claimed``.
+
+    Nothing is caught here: if the page lacks an expected element the
+    resulting exception propagates to the caller.
+    """
+    html = fetch_html(url)
+    soup = bs(html, 'lxml')
+
+    # The heading text is one leading character (presumably "#" -- TODO
+    # confirm) followed by the game id, " – " and the game name.
+    title = soup.select("#scratch-offs > h1")[0].text
+    uid, name = title[1:].split(" – ")
+
+    details_content = soup.find("div", "ticketDetailsContent")
+
+    how_to_play = h.handle(str(details_content.find_all("p")[1]))
+
+    price_paragraph = details_content.find(
+        string=re.compile(r"Ticket Price:")
+    ).parent.parent
+    price = float(re.search(r"\$(\d+\.\d+)", price_paragraph.text).group(1))
+    table = soup.find("table", "scratchOdds").find("tbody")
+    prize_rows = table.select("tr")
+
+    # Some FL tickets are $X/Year for life.
+    # "Life" in Florida is 20 years.
+    def get_value(prize):
+        amount = float(re.sub(r'[^\d\.]', '', prize))
+        if re.search(r"(Year|Yr)", prize, re.IGNORECASE):
+            return amount * 20
+        if re.search(r"(Week|Wk)", prize, re.IGNORECASE):
+            return amount * 52 * 20
+        return amount
+
+    def to_int(text):
+        return int(text.replace(",", ""))
+
+    prizes = []
+    for cells in (row.find_all("td") for row in prize_rows):
+        # NOTE(review): column 2 looks like the total number of prizes and
+        # column 3 the number still unclaimed -- confirm against the page.
+        available = to_int(cells[3].text)
+        prizes.append(
+            {
+                "prize": cells[0].text,
+                "value": get_value(cells[0].text),
+                "available": available,
+                "claimed": to_int(cells[2].text) - available,
+            }
+        )
+
+    # The odds cell is split on "-in-"; the right-hand number times the
+    # count of top prizes is used as the initial ticket count.
+    top_prize_odds = float(
+        prize_rows[0].find_all("td")[1].text.split("-in-")[1].replace(",", "")
+    )
+    num_tx_initial = (prizes[0]["available"] + prizes[0]["claimed"]) * top_prize_odds
+    image_url = soup.find("img", "ticketPicture").attrs["src"]
+
+    game = {
+        "name": name,
+        "game_id": uid,
+        "how_to_play": how_to_play,
+        "price": price,
+        "state": "fl",
+        "num_tx_initial": num_tx_initial,
+        "image_urls": [image_url],
+        "url": url,
+        "prizes": prizes,
+    }
+    return game
+
+
+def main():
+    """Scrape every game linked from the remaining-prizes index page.
+
+    Returns:
+        A list of game dicts as returned by ``parse_game``. A game whose
+        page cannot be processed is logged and left out of the list.
+    """
+    index = fetch_html(INDEX)
+    soup = bs(index, "lxml")
+    game_urls = [BASE + t["href"] for t in soup.select(".gameNameLink > a")]
+    games = []
+
+    for url in game_urls:
+        try:
+            game = parse_game(url)
+        except Exception as e:
+            # Best effort: one bad page must not abort the whole scrape.
+            logger.error("Unable to process {}.\n{}".format(url, e))
+        else:
+            # Append only on success; otherwise `game` is unbound (first
+            # URL) or still holds the previous URL's result.
+            games.append(game)
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
diff --git a/tests/test_florida.py b/tests/test_florida.py
new file mode 100644
index 0000000..8d00eb8
--- /dev/null
+++ b/tests/test_florida.py
@@ -0,0 +1,17 @@
+import unittest
+import requests
+
+from lottery_data_scraper import florida
+from lottery_data_scraper import schemas
+
+class TestFlorida(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = "https://flalottery.com/scratch-offsGameDetails?gameNumber=7025"
+        game = florida.parse_game(url)
+        self.assertEqual(game['name'], "MYSTERY MULTIPLIER")
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["num_tx_initial"], 20513700)
+        self.assertEqual(game["game_id"], "7025")
+        self.assertEqual(game["prizes"][0]["prize"], "$1,000,000.00")
+        self.assertEqual(game["prizes"][0]["value"], 1000000)
\ No newline at end of file