added parser and tests for idaho
parent
904a5d2462
commit
445fe61182
@ -0,0 +1,119 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup as bs
|
||||||
|
import html2text
|
||||||
|
|
||||||
|
from lottery_data_scraper.schemas import GameSchema
|
||||||
|
from lottery_data_scraper.util import fetch_html
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared HTML-to-text converter used for the "how to play" section;
# configured to ignore links.
h = html2text.HTML2Text()
h.ignore_links = True

# Site root; get_games() prepends it to the hrefs found on the index page.
BASE = "https://www.idaholottery.com"
# Scratch-game index page; main() passes it to get_games().
INDEX = "https://www.idaholottery.com/games/scratch"
|
||||||
|
|
||||||
|
def get_games(url):
    """Return the URL of every game page linked from the index page at *url*.

    The page is fetched with ``fetch_html`` and parsed with BeautifulSoup
    (lxml).  Each anchor matching ``.game__inner a.image-link`` contributes
    one entry: ``BASE`` followed by the anchor's ``href`` attribute.

    Args:
        url: Address of the index page to scan.

    Returns:
        A list of URL strings, in document order.
    """
    index_soup = bs(fetch_html(url), "lxml")

    links = []
    for anchor in index_soup.select(".game__inner a.image-link"):
        # BASE is prepended to every href as-is.
        links.append(BASE + anchor.attrs["href"])

    return links
|
||||||
|
|
||||||
|
def _cell_texts(row):
    """Return the stripped text of every ``<td>`` cell in a table row."""
    return [cell.text.strip() for cell in row.find_all("td")]


def _estimate_total(remaining, fraction_remaining):
    """Guess a prize tier's initial count from the count still remaining.

    Divides *remaining* by the fraction of prizes still unclaimed in the most
    recent trustworthy row.  When that fraction is 0 the division is
    impossible, so *remaining* itself (the smallest consistent total) is
    returned instead of raising ``ZeroDivisionError``.
    """
    if fraction_remaining:
        return int(remaining / fraction_remaining)
    return remaining


def parse_game(url):
    """Scrape one Idaho scratch-game page into a game dictionary.

    Args:
        url: Address of the game's detail page.

    Returns:
        A dict with the keys ``name``, ``url``, ``image_urls``, ``state``,
        ``game_id``, ``how_to_play``, ``price``, ``num_tx_initial`` and
        ``prizes``.  ``prizes`` is a list of dicts with the keys ``prize``
        (the raw prize text), ``available``, ``claimed`` and ``value``.

    Raises:
        Nothing is caught here beyond the ``ValueError`` fallbacks in the
        prize loop; any other lookup or parse failure (missing element,
        unexpected cell count, non-numeric text) propagates to the caller.
    """
    game_html = fetch_html(url)
    game_soup = bs(game_html, "lxml")

    name = game_soup.select(".section-game h5")[0].text

    image_url = game_soup.select(".section__image-holder img")[0].attrs["src"]

    # The game id is the part of the image file name before the first "_".
    game_id = image_url.split("/")[-1].split("_")[0]

    how_to_play = h.handle(str(game_soup.find(id="tab2")))

    # The price is taken from the second ".list-badgets h4" element.
    price_str = game_soup.select(".list-badgets h4")[1].text
    price = float(price_str.replace("$", ""))

    table = game_soup.find(class_="full-rules-and-odds")
    rows_soup = table.tbody.find_all("tr")

    # Each row must have exactly five cells, read as
    # (total, prize, remaining, odds, <ignored>).
    grand_total, _, _, grand_odds, _ = _cell_texts(rows_soup[0])

    # Initial ticket count = (tickets per winner) * (number of winners),
    # computed from the first row of the table.
    num_tx_initial = int(grand_odds.replace("1:", "")) * int(grand_total)

    # Fraction of prizes still unclaimed in the most recent row whose numbers
    # were usable; used to fill in gaps in later rows.
    most_recent_percent_remaining = 1

    prizes = []
    for total, prize, remaining, _, _ in map(_cell_texts, rows_soup):
        # Their data is dirty. Here are some hacks to try and fix it.
        # Sometimes, the total is missing.
        # Try to guess it.
        try:
            total = int(total)
        except ValueError:
            total = _estimate_total(int(remaining), most_recent_percent_remaining)

        value = float(prize.replace("$", "").replace(",", ""))

        try:
            remaining = int(remaining)
        except ValueError:
            # Remaining count is missing: derive it from the last good ratio.
            remaining = int(total * most_recent_percent_remaining)
        else:
            # Sometimes, the total is less than the remaining.
            if total < remaining:
                total = _estimate_total(remaining, most_recent_percent_remaining)
            # A zero total cannot yield a ratio; keep the previous one.
            if total:
                most_recent_percent_remaining = remaining / total

        # There is a typo in the $1 prize of $5x the cash.
        if prize == "$1" and re.search(r"(?i)5x the cash", name):
            total = 276000  # num tx / odds
            # Truncated to int so counts stay integral like every other row.
            remaining = int(total * most_recent_percent_remaining)

        prizes.append(
            {
                "prize": prize,
                "available": remaining,
                "claimed": total - remaining,
                "value": value,
            }
        )

    game = {
        "name": name,
        "url": url,
        # NOTE(review): the URL is not quoted, so this string is not valid
        # JSON -- confirm the format GameSchema expects before changing it.
        "image_urls": f"[{image_url}]",
        "state": "id",
        "game_id": game_id,
        "how_to_play": how_to_play,
        "price": price,
        "num_tx_initial": num_tx_initial,
        "prizes": prizes,
    }

    return game
|
||||||
|
|
||||||
|
def main():
    """Scrape every game linked from the scratch-game index.

    Game URLs come from ``get_games(INDEX)``; each one is parsed with
    ``parse_game``.  A game whose page cannot be parsed is logged and
    skipped, so one bad page does not abort the run and never causes a
    stale or undefined result to be appended.

    Returns:
        A list of the game dicts that parsed successfully, in index order.
    """
    games = []
    for url in get_games(INDEX):
        try:
            game = parse_game(url)
        except Exception as e:
            # Best-effort scrape: report the failure and move on.
            logger.error("Unable to parse {}.\n{}".format(url, e))
            continue
        games.append(game)
    return games
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the scraper, then print all games serialized through GameSchema.
    scraped = main()
    print(GameSchema(many=True).dumps(scraped))
|
||||||
|
|
@ -0,0 +1,17 @@
|
|||||||
|
import unittest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from lottery_data_scraper import idaho
|
||||||
|
from lottery_data_scraper import schemas
|
||||||
|
|
||||||
|
class TestIdaho(unittest.TestCase):
    """Checks for the Idaho scratch-game parser."""

    def test_parse_game_html(self):
        """Parse one game page and verify its headline fields."""
        # URL chosen arbitrarily
        parsed = idaho.parse_game(
            "https://www.idaholottery.com/games/scratch/lucky-rooster-bingo"
        )

        self.assertEqual(parsed["name"], "Lucky Rooster Bingo")
        self.assertEqual(parsed["price"], 10)
        self.assertEqual(parsed["game_id"], "1716")

        # The first entry in the prize list is the one checked here.
        top_prize = parsed["prizes"][0]
        self.assertEqual(top_prize["prize"], "$100,000")
        self.assertEqual(top_prize["value"], 100000)

        self.assertEqual(parsed["num_tx_initial"], 339900)
|
Loading…
Reference in New Issue