You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

128 lines
3.5 KiB
Python

import logging
import os
import re
import sys
import traceback
from xmlrpc import client
from bs4 import BeautifulSoup as bs
import html2text
import requests
from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html
# Site roots used to build absolute URLs and to locate the games index.
BASE = "https://www.ctlottery.org"
INDEX = "https://ctlottery.org/ScratchGamesTable"

# Request headers describing an XHR call from a desktop Firefox browser.
# NOTE(review): not referenced by any function visible in this file --
# confirm whether fetch_html is expected to receive them.
headers = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
    "Referer": "https://www.ctlottery.org/ScratchGames",
}

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared html2text converter with link handling switched off.
h = html2text.HTML2Text()
h.ignore_links = True
def get_games_urls(url):
    """Return the absolute URL of every game linked from the index page.

    Args:
        url: Address of the page holding the scratch-games table.

    Returns:
        A list of strings, each ``BASE`` joined with the ``href`` of an
        anchor found in a cell of the first table on the page.
    """
    index_soup = bs(fetch_html(url), "lxml")
    first_table = index_soup.find("table")
    anchors = first_table.select("tr > td > a")
    return [BASE + anchor.attrs["href"] for anchor in anchors]
def _first_number(text):
    """Return the first digit run in *text* (commas allowed) as a float.

    Raises:
        ValueError: If *text* contains no digits.
    """
    # Require a leading digit so a stray comma in the text is never
    # mistaken for an amount.
    match = re.search(r"\d[\d,]*", text)
    if match is None:
        raise ValueError("No numeric amount found in prize {!r}".format(text))
    return float(match.group().replace(",", ""))


def _parse_prize_value(prize):
    """Convert a prize description from the prize table into a dollar value.

    Three shapes are recognised, checked in this order:

    * "... month ... for life": the first number is taken as a monthly
      amount and multiplied by 12 * 20 (valued as 20 years of payments).
    * "$N million": N times 1,000,000.
    * anything else: the first number in the text.

    Raises:
        ValueError: If no numeric amount can be found.
    """
    if re.search(r"(?i)month.*for.*life", prize):
        return _first_number(prize) * 20 * 12

    # The dollar sign must be escaped: a bare "$" is the end-of-string
    # anchor, which made this case impossible to match.
    million = re.search(r"(?i)\$\s*(\d+(?:\.\d+)?)\s+million", prize)
    if million:
        return float(million.group(1)) * 1000000

    return _first_number(prize)


def parse_game(game_url):
    """Scrape one game page and return its details as a dict.

    Each game page has two tables:
      Table 1: ticket price, total number of tickets, odds
      Table 2: prize table

    Args:
        game_url: Absolute URL of the game's page.

    Returns:
        A dict with the keys ``state``, ``game_id``, ``name``, ``price``,
        ``url``, ``prizes``, ``num_tx_initial``, ``how_to_play`` and
        ``image_urls``. ``prizes`` is a list of dicts with the keys
        ``prize``, ``value``, ``claimed`` and ``available``.

    Raises:
        AttributeError: If an expected element or pattern is missing.
        ValueError: If a prize row does not have exactly three cells or a
            numeric field cannot be parsed.
    """
    game_html = fetch_html(game_url)
    game_soup = bs(game_html, "lxml")
    name = game_soup.find("h2").text
    game_id = re.match(
        r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
    ).group(1)

    # Table 1: price and ticket count are read from the block's flat text.
    table_one = game_soup.find(class_="img-detail-block")
    price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
    num_tx_str = re.search(
        r"Total # of Tickets:\s*(\d[\d,]*)", table_one.text
    ).group(1)
    num_tx_initial = int(num_tx_str.replace(",", ""))

    # Table 2: one row per prize tier -> (description, total, available).
    table_two = game_soup.find(class_="unclaimed-prize-wrap")
    prize_rows = table_two.find("tbody").find_all("tr")
    prizes = []
    for row in prize_rows:
        prize, total, available = [cell.text for cell in row.find_all("td")]
        total = int(total.replace(",", ""))
        available = int(available.replace(",", ""))
        prizes.append(
            {
                "prize": prize,
                "value": _parse_prize_value(prize),
                "claimed": total - available,
                "available": available,
            }
        )

    how_to_play_soup = game_soup.find(class_="play-text-wrap")
    # Remove the heading and button tags when present; a page without
    # them should not abort the whole parse.
    for tag_name in ("h3", "a"):
        tag = how_to_play_soup.find(tag_name)
        if tag is not None:
            tag.extract()
    how_to_play = h.handle(how_to_play_soup.text)

    # NOTE(review): despite the plural name this is a single URL string;
    # confirm against what GameSchema expects.
    image_urls = BASE + game_soup.find(id="ticket_image").attrs["src"]

    game = {
        "state": "ct",
        "game_id": game_id,
        "name": name,
        "price": price,
        # Individual games are JavaScript links
        "url": game_url,
        "prizes": prizes,
        "num_tx_initial": num_tx_initial,
        "how_to_play": how_to_play,
        "image_urls": image_urls,
    }
    return game
def main():
    """Scrape every game linked from the index page.

    Games are parsed one at a time. A game whose page fails to parse is
    logged and skipped, so the result holds only successfully parsed games.

    Returns:
        A list of game dicts as returned by ``parse_game``.
    """
    games = []
    for game_url in get_games_urls(INDEX):
        try:
            game = parse_game(game_url)
        except Exception as e:
            # Best effort: one broken page must not abort the whole run,
            # and its URL must not end up in the result list.
            logger.error("Unable to parse game %s.\n%s", game_url, e)
            continue
        games.append(game)
    return games
if __name__ == "__main__":
    # Script entry point: scrape all games, then print them serialized
    # through GameSchema.
    games = main()
    schema = GameSchema(many=True)
    serialized = schema.dumps(games)
    print(serialized)