Add Connecticut parser and test

3 years ago · 37c40360c2
parent 904a5d2462
commit 37c40360c2
2 changed files with 144 additions and 0 deletions
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@ -0,0 +1,127 @@
 import logging
 import os
 import re
 import sys
 import traceback
 from xmlrpc import client
 from bs4 import BeautifulSoup as bs
 import html2text
 import requests
 from lottery_data_scraper.schemas import GameSchema 
 from lottery_data_scraper.util import fetch_html
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Shared html2text converter; configured to leave links out of its output.
 # parse_game() uses it to render the "how to play" text.
 h = html2text.HTML2Text()
 h.ignore_links = True
 # Prefix concatenated onto the hrefs and image "src" values scraped from pages.
 BASE = "https://www.ctlottery.org"
 # Page whose table links to the individual games; main() hands it to
 # get_games_urls().
 # NOTE(review): this host has no "www." while BASE does -- confirm intended.
 INDEX = "https://ctlottery.org/ScratchGamesTable"
 # Browser-like request headers (XHR marker, user agent, referer).
 # NOTE(review): not referenced by any code visible in this file; presumably
 # meant to accompany the page requests -- confirm or remove.
 headers = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
    "Referer": "https://www.ctlottery.org/ScratchGames",
 }
 def get_games_urls(url):
     """Return the absolute URL of every game linked from the index table.

     Args:
         url: Address of the page containing the games table.

     Returns:
         A list of strings, one per ``<a>`` found directly inside a table
         cell of the first ``<table>`` on the page, each prefixed with BASE.

     Raises:
         AttributeError: if the page contains no ``<table>``.
         KeyError: if a matched anchor has no ``href`` attribute.
     """
     index_soup = bs(fetch_html(url), "lxml")
     # Only anchors that are direct children of a cell in a table row count.
     anchors = index_soup.find("table").select("tr > td > a")
     return [BASE + anchor.attrs["href"] for anchor in anchors]
 def parse_game(game_url):
     """Scrape a single game page into a plain dict.

     The parser reads three regions of the page:
       * the details block (class ``img-detail-block``): ticket price and
         total number of tickets;
       * the prize table (class ``unclaimed-prize-wrap``): one row per prize
         tier with three cells -- prize label, total count, available count;
       * the instructions block (class ``play-text-wrap``).

     Args:
         game_url: URL of the game's page.

     Returns:
         A dict with the keys ``state``, ``game_id``, ``name``, ``price``,
         ``url``, ``prizes``, ``num_tx_initial``, ``how_to_play`` and
         ``image_urls``. ``prizes`` is a list of dicts with the keys
         ``prize``, ``value``, ``claimed`` and ``available``.

     Raises:
         AttributeError: if an expected element or text pattern is missing.
         ValueError: if a prize row does not have exactly three cells or a
             scraped number cannot be converted.
     """
     game_soup = bs(fetch_html(game_url), "lxml")
     name = game_soup.find("h2").text
     game_id = re.match(
         r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
     ).group(1)

     # Details block: ticket price and total ticket count.
     details_text = game_soup.find(class_="img-detail-block").text
     price = int(re.search(r"Ticket Price:\$(\d*)", details_text).group(1))
     # A digit followed by digits/commas; the previous character classes also
     # accepted a literal "*" and required at least two characters.
     num_tx_str = re.search(
         r"Total # of Tickets:(\d[\d,]*)", details_text
     ).group(1)
     num_tx_initial = int(num_tx_str.replace(",", ""))

     # Prize table: one row per prize tier.
     prize_table = game_soup.find(class_="unclaimed-prize-wrap")
     prizes = []
     for row in prize_table.find("tbody").find_all("tr"):
         prize, total, available = [cell.text for cell in row.find_all("td")]
         total = int(total.replace(",", ""))
         available = int(available.replace(",", ""))
         prizes.append(
             {
                 "prize": prize,
                 "value": _parse_prize_value(prize),
                 "claimed": total - available,
                 "available": available,
             }
         )

     how_to_play_soup = game_soup.find(class_="play-text-wrap")
     # Drop the heading and the button so only the instructions remain; a
     # page lacking either one is still parseable.
     for tag_name in ("h3", "a"):
         unwanted = how_to_play_soup.find(tag_name)
         if unwanted is not None:
             unwanted.extract()
     how_to_play = h.handle(how_to_play_soup.text)

     # NOTE(review): a single URL string despite the plural key name --
     # confirm against what GameSchema expects.
     image_urls = BASE + game_soup.find(id="ticket_image").attrs["src"]

     return {
         "state": "ct",
         "game_id": game_id,
         "name": name,
         "price": price,
         "url": game_url,
         "prizes": prizes,
         "num_tx_initial": num_tx_initial,
         "how_to_play": how_to_play,
         "image_urls": image_urls,
     }


 def _parse_prize_value(prize):
     """Convert a prize label from the prize table into a dollar amount.

     Recognised label shapes, checked in this order:
       * "... month ... for life": the figure in the label multiplied by
         12 and by 20 (i.e. treated as 20 years of monthly payments);
       * "$N million" (N may have a decimal part): N * 1,000,000;
       * anything else: the first number in the label, commas removed.

     Args:
         prize: Text of the prize cell, e.g. ``"$100,000"``.

     Returns:
         The value as a float.

     Raises:
         AttributeError: if the label contains no digits.
     """
     if re.search(r"(?i)month.*for.*life", prize):
         amount = re.search(r"\d[\d,]*", prize).group()
         return float(amount.replace(",", "")) * 20 * 12

     # The dollar sign must be escaped: a bare "$" is the end-of-string
     # anchor, which made this branch unreachable.
     millions = re.search(r"(?i)\$(\d+(?:\.\d+)?)\s*million", prize)
     if millions:
         return float(millions.group(1)) * 1000000

     # Start on a digit so a stray comma earlier in the label is not matched.
     amount = re.search(r"\d[\d,]*", prize).group()
     return float(amount.replace(",", ""))
 def main():
     """Scrape every game linked from the index page.

     A game whose page cannot be parsed is logged and skipped so that one
     bad page does not abort the whole run.

     Returns:
         A list with one parse_game() result per successfully parsed game,
         in index order; empty when the index lists no games.
     """
     games = []
     for game_url in get_games_urls(INDEX):
         try:
             game = parse_game(game_url)
         except Exception as e:
             # Top-level boundary: report the failure and move on. Only
             # successfully parsed games may reach the result list.
             logger.error("Unable to parse game %s.\n%s", game_url, e)
             continue
         games.append(game)
     return games
 if __name__ == "__main__":
     # Script entry point: scrape all games, then print them serialized
     # through GameSchema in many-objects mode.
     scraped = main()
     print(GameSchema(many=True).dumps(scraped))
--- a/tests/test_connecticut.py
+++ b/tests/test_connecticut.py
@ -0,0 +1,17 @@
 import unittest
 import requests
 from lottery_data_scraper import connecticut
 from lottery_data_scraper import schemas
 class TestConnecticut(unittest.TestCase):
     # Exercises connecticut.parse_game end to end on one game page.

     def test_parse_game_html(self):
         # Any game page would do; this one was picked arbitrarily.
         parsed = connecticut.parse_game(
             "https://www.ctlottery.org/ScratchGames/1740/"
         )

         # Header fields.
         self.assertEqual(parsed["name"], "Extreme Green")
         self.assertEqual(parsed["price"], 10)
         self.assertEqual(parsed["game_id"], "1740")

         # First row of the prize table: label and its numeric value.
         top_prize = parsed["prizes"][0]
         self.assertEqual(top_prize["prize"], "$100,000")
         self.assertEqual(top_prize["value"], 100000)

         # Total ticket count with thousands separators removed.
         self.assertEqual(parsed["num_tx_initial"], 2230800)