added parser and tests for idaho
parent
904a5d2462
commit
445fe61182
@ -0,0 +1,119 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup as bs
|
||||
import html2text
|
||||
|
||||
from lottery_data_scraper.schemas import GameSchema
|
||||
from lottery_data_scraper.util import fetch_html
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = True
|
||||
|
||||
BASE = "https://www.idaholottery.com"
|
||||
INDEX = "https://www.idaholottery.com/games/scratch"
|
||||
|
||||
def get_games(url):
|
||||
html = fetch_html(url)
|
||||
soup = bs(html, "lxml")
|
||||
game_urls = [BASE + n.attrs["href"] for n in soup.select(".game__inner a.image-link")]
|
||||
|
||||
return game_urls
|
||||
|
||||
def parse_game(url):
|
||||
game_html = fetch_html(url)
|
||||
game_soup = bs(game_html, "lxml")
|
||||
|
||||
name = game_soup.select(".section-game h5")[0].text
|
||||
|
||||
image_url = game_soup.select(".section__image-holder img")[0].attrs["src"]
|
||||
|
||||
game_id = image_url.split("/")[-1].split("_")[0]
|
||||
|
||||
how_to_play = h.handle(str(game_soup.find(id="tab2")))
|
||||
|
||||
price_str = game_soup.select(".list-badgets h4")[1].text
|
||||
price = float(price_str.replace("$", ""))
|
||||
|
||||
table = game_soup.find(class_="full-rules-and-odds")
|
||||
rows_soup = table.tbody.find_all("tr")
|
||||
grand_prize_soup = rows_soup[0]
|
||||
total, prize, remaining, odds, _ = map(
|
||||
lambda x: x.text.strip(), grand_prize_soup.find_all("td")
|
||||
)
|
||||
|
||||
odds = int(odds.replace("1:", ""))
|
||||
|
||||
num_tx_initial = odds * int(total)
|
||||
|
||||
most_recent_percent_remaining = 1
|
||||
|
||||
prizes = []
|
||||
for total, prize, remaining, odds, _ in [
|
||||
map(lambda x: x.text.strip(), row.find_all("td")) for row in rows_soup
|
||||
]:
|
||||
# Their data is dirty. Here are some hacks to try and fix it.
|
||||
# Sometimes, the total is missing.
|
||||
# Try to guess it.
|
||||
try:
|
||||
total = int(total)
|
||||
except ValueError:
|
||||
total = int(int(remaining) / most_recent_percent_remaining)
|
||||
|
||||
value = float(prize.replace("$", "").replace(",", ""))
|
||||
|
||||
try:
|
||||
remaining = int(remaining)
|
||||
# Sometimes, the total is less than the remaining.
|
||||
if total < remaining:
|
||||
total = int(remaining / most_recent_percent_remaining)
|
||||
most_recent_percent_remaining = remaining / total
|
||||
except ValueError:
|
||||
remaining = int(total * most_recent_percent_remaining)
|
||||
|
||||
# There is a typo in the $1 prize of $5x the cash.
|
||||
if prize == "$1" and re.search(r"(?i)5x the cash", name):
|
||||
total = 276000 # num tx / odds
|
||||
remaining = total * most_recent_percent_remaining
|
||||
|
||||
prizes.append(
|
||||
{
|
||||
"prize": prize,
|
||||
"available": remaining,
|
||||
"claimed": total - remaining,
|
||||
"value": value,
|
||||
}
|
||||
)
|
||||
|
||||
game = {
|
||||
"name": name,
|
||||
"url": url,
|
||||
"image_urls": f"[{image_url}]",
|
||||
"state": "id",
|
||||
"game_id": game_id,
|
||||
"how_to_play": how_to_play,
|
||||
"price": price,
|
||||
"num_tx_initial": num_tx_initial,
|
||||
"prizes": prizes
|
||||
}
|
||||
|
||||
return game
|
||||
|
||||
def main():
|
||||
game_urls = get_games(INDEX)
|
||||
games = []
|
||||
for url in game_urls:
|
||||
try:
|
||||
game = parse_game(url)
|
||||
except Exception as e:
|
||||
logger.error("Unable to parse {}.\n{}".format(url, e))
|
||||
games.append(game)
|
||||
return games
|
||||
|
||||
if __name__ == "__main__":
|
||||
games = main()
|
||||
schema = GameSchema(many=True)
|
||||
print(schema.dumps(games))
|
||||
|
@ -0,0 +1,17 @@
|
||||
import unittest
|
||||
import requests
|
||||
|
||||
from lottery_data_scraper import idaho
|
||||
from lottery_data_scraper import schemas
|
||||
|
||||
class TestIdaho(unittest.TestCase):
|
||||
def test_parse_game_html(self):
|
||||
# URL chosen arbitrarily
|
||||
url = "https://www.idaholottery.com/games/scratch/lucky-rooster-bingo"
|
||||
game = idaho.parse_game(url)
|
||||
self.assertEqual(game['name'], "Lucky Rooster Bingo")
|
||||
self.assertEqual(game["price"], 10)
|
||||
self.assertEqual(game["game_id"], "1716")
|
||||
self.assertEqual(game["prizes"][0]["prize"], "$100,000")
|
||||
self.assertEqual(game["prizes"][0]["value"], 100000)
|
||||
self.assertEqual(game["num_tx_initial"], 339900)
|
Loading…
Reference in New Issue