adding florida parser and test

3 years ago · e3c0b0fbdc
parent 904a5d2462
commit e3c0b0fbdc
2 changed files with 111 additions and 0 deletions
--- a/lottery_data_scraper/florida.py
+++ b/lottery_data_scraper/florida.py
@ -0,0 +1,94 @@
 import logging
 from functools import partial
 import os
 import re
 from bs4 import BeautifulSoup as bs
 import html2text
 import requests
 from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Site root; main() prepends it to the hrefs collected from the index page.
 BASE = "https://flalottery.com/"
 # Index page that main() scans for links to individual game pages.
 INDEX = "https://flalottery.com/remainingPrizes"
 # Shared HTML-to-text converter; parse_game() uses it to render the
 # how-to-play paragraph.
 h = html2text.HTML2Text()
 def parse_game(url):
    """Scrape one Florida scratch-off game page into a game dictionary.

    Args:
        url: Address of the game's detail page.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``how_to_play``,
        ``price``, ``state``, ``num_tx_initial``, ``image_urls``, ``url``
        and ``prizes``. Each entry of ``prizes`` is a dict with ``prize``
        (the text shown on the page), ``value`` (float), ``available`` and
        ``claimed`` (ints).

    Raises:
        IndexError, AttributeError, ValueError: none of the lookups are
            guarded, so a page whose markup does not match the expected
            layout fails with one of these.
    """
    html = fetch_html(url)
    soup = bs(html, 'lxml')
    title = soup.select("#scratch-offs > h1")[0].text
    # The first character of the heading is dropped (presumably a "#"
    # prefix -- confirm against the page). Split on the first separator
    # only, so a game name that itself contains " – " stays in one piece.
    uid, name = title[1:].split(" – ", 1)
    details_content = soup.find("div", "ticketDetailsContent")
    how_to_play = h.handle(str(details_content.find_all("p")[1]))
    price_paragraph = details_content.find(
        string=re.compile(r"Ticket Price:")
    ).parent.parent
    # Cents are optional so a whole-dollar price such as "$5" still parses.
    price = float(
        re.search(r"\$(\d+(?:\.\d+)?)", price_paragraph.text).group(1)
    )
    table = soup.find("table", "scratchOdds").find("tbody")
    prize_rows = table.select("tr")

    # Some FL tickets are $X/Year for life.
    # "Life" in Florida is 20 years.
    def get_value(prize):
        """Return the total cash value described by a prize label."""
        amount = float(re.sub(r'[^\d\.]', '', prize))
        if re.search(r"(Year|Yr)", prize, re.IGNORECASE):
            return amount * 20
        elif re.search(r"(Week|Wk)", prize, re.IGNORECASE):
            return amount * 52 * 20
        else:
            return amount

    def to_int(text):
        """Parse an integer that may contain thousands separators."""
        return int(text.replace(",", ""))

    prizes = []
    for row in prize_rows:
        cells = row.find_all("td")
        label = cells[0].text
        # The third cell is treated as the total number of prizes and the
        # fourth as the number still unclaimed; parse each only once.
        total = to_int(cells[2].text)
        remaining = to_int(cells[3].text)
        prizes.append(
            {
                "prize": label,
                "value": get_value(label),
                "available": remaining,
                "claimed": total - remaining,
            }
        )

    # The odds cell of the first row reads "1-in-N"; keep N.
    top_prize_odds = float(
        prize_rows[0].find_all("td")[1].text.split("-in-")[1].replace(",", "")
    )
    # Tickets printed = number of top prizes * odds of winning one.
    num_tx_initial = (prizes[0]["available"] + prizes[0]["claimed"]) * top_prize_odds
    image_url = soup.find("img", "ticketPicture").attrs["src"]
    game = {
        "name": name,
        "game_id": uid,
        "how_to_play": how_to_play,
        "price": price,
        "state": "fl",
        "num_tx_initial": num_tx_initial,
        "image_urls": [image_url],
        "url": url,
        "prizes": prizes,
    }
    return game
 def main():
    """Scrape every game linked from the Florida remaining-prizes index.

    Returns:
        A list of game dicts as produced by ``parse_game``, one per game
        page that parsed successfully. Pages that fail are logged and
        left out of the result.
    """
    index = fetch_html(INDEX)
    soup = bs(index, "lxml")
    game_urls = [BASE + t["href"] for t in soup.select(".gameNameLink > a")]
    games = []
    for url in game_urls:
        try:
            game = parse_game(url)
        except Exception as e:
            logger.error("Unable to process {}.\n{}".format(url, e))
            # Skip this page: falling through would append an unbound or
            # stale ``game`` from an earlier iteration.
            continue
        games.append(game)
    return games
 if __name__ == "__main__":
    # Script entry point: scrape all games, then print them serialized
    # through GameSchema as a single document.
    games = main()
    print(GameSchema(many=True).dumps(games))
--- a/tests/test_florida.py
+++ b/tests/test_florida.py
@ -0,0 +1,17 @@
 import unittest
 import requests
 from lottery_data_scraper import florida
 from lottery_data_scraper import schemas
 class TestFlorida(unittest.TestCase):
    """Checks florida.parse_game against one known game page."""

    def test_parse_game_html(self):
        # The game number in the URL was chosen arbitrarily.
        parsed = florida.parse_game(
            "https://flalottery.com/scratch-offsGameDetails?gameNumber=7025"
        )

        # Headline fields of the game itself.
        self.assertEqual(parsed['name'], "MYSTERY MULTIPLIER")
        self.assertEqual(parsed["price"], 10)
        self.assertEqual(parsed["num_tx_initial"], 20513700)
        self.assertEqual(parsed["game_id"], "7025")

        # First prize row: the label as displayed and its numeric value.
        top_prize = parsed["prizes"][0]
        self.assertEqual(top_prize["prize"], "$1,000,000.00")
        self.assertEqual(top_prize["value"], 1000000)