commit
b33f80b331
@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup as bs
|
||||
import html2text
|
||||
from lottery_data_scraper.schemas import GameSchema
|
||||
from lottery_data_scraper.util import fetch_html
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared html2text converter, configured to ignore links.
h = html2text.HTML2Text()
h.ignore_links = True

# Site root; prepended to the relative href/src values scraped from pages.
BASE = "https://www.ctlottery.org"

# Index page whose table links to the individual scratch-game pages.
INDEX = "https://ctlottery.org/ScratchGamesTable"

# Browser-like request headers.
# NOTE(review): none of the functions visible in this module pass these to
# fetch_html -- confirm whether they are still needed.
headers = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
    "Referer": "https://www.ctlottery.org/ScratchGames",
}
|
||||
|
||||
|
||||
def get_games_urls(url):
    """Return the absolute URL of every game linked from the index table.

    The page at ``url`` is fetched and parsed; each ``<a>`` that is a direct
    child of a ``<td>`` in the first ``<table>`` contributes one URL, built
    by prefixing its ``href`` with ``BASE``.
    """
    index_soup = bs(fetch_html(url), "lxml")
    anchors = index_soup.find("table").select("tr > td > a")
    return [BASE + anchor.attrs["href"] for anchor in anchors]
|
||||
|
||||
|
||||
def _parse_prize_value(prize):
    """Convert a prize label from the prize table into a dollar amount.

    Three label shapes are recognised, checked in this order:

    * "... month ... for life": the first number in the label is taken as
      the monthly payment and multiplied by 20 * 12 (240 payments).
    * "$N million": N (integer or decimal) times 1,000,000.
    * anything else: the first run of digits and commas, e.g. "$100,000".

    Returns the value as a float. A label that contains no number raises
    (AttributeError from the missing regex match), so the caller can skip
    the game instead of recording a made-up value.
    """
    if re.search(r"(?i)month.*for.*life", prize):
        monthly = re.search(r"[\d,]+", prize).group()
        return float(monthly.replace(",", "")) * 20 * 12

    # The dollar sign must be escaped: a bare "$" is an end-of-string anchor
    # and would make this pattern impossible to match.
    millions = re.search(r"(?i)\$(\d+(?:\.\d+)?)\s*million", prize)
    if millions:
        return float(millions.group(1)) * 1000000

    amount = re.search(r"[\d,]+", prize).group()
    return float(amount.replace(",", ""))


def parse_game(game_url):
    """Scrape a single Connecticut scratch-game page into a game dict.

    The page is expected to contain a details block (ticket price, total
    number of tickets) and a prize table with three columns per row:
    prize label, total prizes, and prizes still available.

    Args:
        game_url: URL of the game page to fetch.

    Returns:
        A dict with the keys ``state``, ``game_id`` (str), ``name``,
        ``price`` (int), ``url``, ``prizes`` (list of dicts with ``prize``,
        ``value``, ``claimed`` and ``available``), ``num_tx_initial`` (int),
        ``how_to_play`` and ``image_urls`` (single-element list).

    Raises:
        AttributeError, ValueError: when the page does not have the expected
            elements or text patterns. Nothing is caught here.
    """
    game_soup = bs(fetch_html(game_url), "lxml")

    name = game_soup.find("h2").text
    game_id = re.match(
        r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
    ).group(1)

    # Details block: ticket price and total number of tickets.
    details_text = game_soup.find(class_="img-detail-block").text
    price = int(re.search(r"Ticket Price:\$(\d*)", details_text).group(1))
    # Digits with thousands separators, e.g. "2,230,800".
    num_tx_str = re.search(r"Total # of Tickets:([\d,]+)", details_text).group(1)
    num_tx_initial = int(num_tx_str.replace(",", ""))

    # Prize table: one row per prize tier.
    prize_table = game_soup.find(class_="unclaimed-prize-wrap")
    prizes = []
    for row in prize_table.find("tbody").find_all("tr"):
        prize, total, available = [cell.text for cell in row.find_all("td")]
        total = int(total.replace(",", ""))
        available = int(available.replace(",", ""))
        prizes.append(
            {
                "prize": prize,
                "value": _parse_prize_value(prize),
                "claimed": total - available,
                "available": available,
            }
        )

    how_to_play_soup = game_soup.find(class_="play-text-wrap")
    # Remove the heading and button tags so only the instructions remain.
    how_to_play_soup.h3.extract()
    how_to_play_soup.a.extract()
    how_to_play = h.handle(how_to_play_soup.text)

    image_url = BASE + game_soup.find(id="ticket_image").attrs["src"]

    return {
        "state": "ct",
        "game_id": game_id,
        "name": name,
        "price": price,
        # Individual games are JavaScript links
        "url": game_url,
        "prizes": prizes,
        "num_tx_initial": num_tx_initial,
        "how_to_play": how_to_play,
        "image_urls": [image_url],
    }
|
||||
|
||||
|
||||
def main():
    """Scrape every game linked from the index page.

    A game whose page fails to parse is logged and skipped, so one bad page
    does not abort the whole run.

    Returns the list of successfully parsed games.
    """
    parsed_games = []
    for game_url in get_games_urls(INDEX):
        try:
            parsed = parse_game(game_url)
        except Exception as e:
            logger.error("Unable to parse game {}.\n{}".format(game_url, e))
        else:
            parsed_games.append(parsed)
    return parsed_games
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Scrape first, then serialize the collected games with the schema and
    # print the result to stdout.
    scraped = main()
    print(GameSchema(many=True).dumps(scraped))
|
@ -0,0 +1,17 @@
|
||||
import unittest
|
||||
import requests
|
||||
|
||||
from lottery_data_scraper import connecticut
|
||||
from lottery_data_scraper import schemas
|
||||
|
||||
class TestConnecticut(unittest.TestCase):
    """Checks ``connecticut.parse_game`` against one known game page.

    NOTE(review): parse_game is called with a real URL, so this presumably
    needs network access and depends on the site still serving this game --
    confirm.
    """

    def test_parse_game_html(self):
        # URL chosen arbitrarily
        game_url = "https://www.ctlottery.org/ScratchGames/1740/"
        parsed = connecticut.parse_game(game_url)

        self.assertEqual(parsed["name"], "Extreme Green")
        self.assertEqual(parsed["price"], 10)
        self.assertEqual(parsed["game_id"], "1740")

        top_prize = parsed["prizes"][0]
        self.assertEqual(top_prize["prize"], "$100,000")
        self.assertEqual(top_prize["value"], 100000)

        self.assertEqual(parsed["num_tx_initial"], 2230800)
|
Loading…
Reference in New Issue