adding florida parser and test

main
Taylor Hood 1 year ago
parent 904a5d2462
commit e3c0b0fbdc

@@ -0,0 +1,94 @@
import logging
from functools import partial
import os
import re
from bs4 import BeautifulSoup as bs
import html2text
import requests
from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Site root; main() prefixes it to each game link's href to build the game URL.
BASE = "https://flalottery.com/"
# Index page that main() fetches to discover the individual game links.
INDEX = "https://flalottery.com/remainingPrizes"
# Shared HTML-to-text converter; parse_game() uses it for the how-to-play text.
h = html2text.HTML2Text()
def _prize_value(prize):
    """Return the total dollar value of a prize label as a float.

    Some FL tickets are $X/Year (or $X/Week) for life, and "life" in
    Florida is 20 years, so recurring prizes are multiplied out over that
    span. Every character other than digits and "." is dropped before the
    amount is parsed.
    """
    amount = float(re.sub(r"[^\d\.]", "", prize))
    if re.search(r"(Year|Yr)", prize, re.IGNORECASE):
        return amount * 20
    if re.search(r"(Week|Wk)", prize, re.IGNORECASE):
        return amount * 52 * 20
    return amount


def _to_int(text):
    """Parse an integer that may contain thousands separators ("1,234")."""
    return int(text.replace(",", ""))


def parse_game(url):
    """Scrape one Florida scratch-off detail page into a game dict.

    Args:
        url: Address of the game's detail page; it is fetched with
            ``fetch_html`` and echoed back under the ``"url"`` key.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``how_to_play``,
        ``price``, ``state``, ``num_tx_initial``, ``image_urls``, ``url``
        and ``prizes``. ``prizes`` is a list of dicts with the keys
        ``prize``, ``value``, ``available`` and ``claimed``, one per row of
        the odds table.

    Raises:
        IndexError, AttributeError, ValueError: Propagated unchanged when
            the page does not contain the elements or text this parser
            looks for.
    """
    html = fetch_html(url)
    soup = bs(html, "lxml")

    title = soup.select("#scratch-offs > h1")[0].text
    # Drop the first character (presumably a leading "#" - confirm against
    # the page), then split only once so multi-word game names such as
    # "MYSTERY MULTIPLIER" stay intact instead of breaking the unpacking.
    uid, name = title[1:].split(None, 1)
    name = name.strip()

    details_content = soup.find("div", "ticketDetailsContent")
    # The second <p> of the details block is rendered to plain text.
    how_to_play = h.handle(str(details_content.find_all("p")[1]))

    # The "Ticket Price:" text node sits two levels below the element whose
    # text also carries the dollar amount.
    price_paragraph = details_content.find(
        string=re.compile(r"Ticket Price:")
    ).parent.parent
    price = float(re.search(r"\$(\d+\.\d+)", price_paragraph.text).group(1))

    table = soup.find("table", "scratchOdds").find("tbody")
    prize_rows = table.select("tr")

    prizes = []
    for row in prize_rows:
        cells = row.find_all("td")
        label = cells[0].text
        # Column 3 is reported as available; claimed is column 2 minus
        # column 3 (presumably total and remaining counts - confirm).
        total = _to_int(cells[2].text)
        remaining = _to_int(cells[3].text)
        prizes.append(
            {
                "prize": label,
                "value": _prize_value(label),
                "available": remaining,
                "claimed": total - remaining,
            }
        )

    # Odds are written as "1-in-N"; N for the first row scales that row's
    # prize count up to an estimate of the tickets initially printed.
    top_prize_odds = float(
        prize_rows[0].find_all("td")[1].text.split("-in-")[1].replace(",", "")
    )
    num_tx_initial = (
        prizes[0]["available"] + prizes[0]["claimed"]
    ) * top_prize_odds

    image_url = soup.find("img", "ticketPicture").attrs["src"]

    return {
        "name": name,
        "game_id": uid,
        "how_to_play": how_to_play,
        "price": price,
        "state": "fl",
        "num_tx_initial": num_tx_initial,
        "image_urls": [image_url],
        "url": url,
        "prizes": prizes,
    }
def main():
    """Scrape every game linked from the remaining-prizes index page.

    Returns:
        A list of the dicts returned by ``parse_game``, in index order.
        Games whose page could not be processed are logged and left out.
    """
    index = fetch_html(INDEX)
    soup = bs(index, "lxml")
    game_urls = [BASE + t["href"] for t in soup.select(".gameNameLink > a")]
    games = []
    for url in game_urls:
        try:
            game = parse_game(url)
        except Exception as e:
            # Best-effort scrape: one broken page must not abort the run.
            logger.error("Unable to process %s.\n%s", url, e)
            # Skip the append below; otherwise `game` would be unbound on
            # the first URL or still hold the previous game (a duplicate).
            continue
        games.append(game)
    return games
if __name__ == "__main__":
    # Script entry point: scrape all games first, then serialize the whole
    # list with GameSchema and print the result to stdout.
    scraped = main()
    serializer = GameSchema(many=True)
    print(serializer.dumps(scraped))

@@ -0,0 +1,17 @@
import unittest
import requests
from lottery_data_scraper import florida
from lottery_data_scraper import schemas
class TestFlorida(unittest.TestCase):
    """Checks for the Florida scraper's ``parse_game``."""

    def test_parse_game_html(self):
        # Parses one game page and checks the headline fields plus the
        # first prize entry.
        # URL chosen arbitrarily
        game_url = "https://flalottery.com/scratch-offsGameDetails?gameNumber=7025"
        parsed = florida.parse_game(game_url)

        self.assertEqual(parsed['name'], "MYSTERY MULTIPLIER")
        self.assertEqual(parsed["price"], 10)
        self.assertEqual(parsed["num_tx_initial"], 20513700)
        self.assertEqual(parsed["game_id"], "7025")

        first_prize = parsed["prizes"][0]
        self.assertEqual(first_prize["prize"], "$1,000,000.00")
        self.assertEqual(first_prize["value"], 1000000)
Loading…
Cancel
Save