Merge branch 'main' of https://github.com/owogawc/lottery_data_scraper

adding new mexico to main
3 years ago · b64fe3a307
parent c99c961d56 29d9607bb6
commit b64fe3a307
2 changed files with 114 additions and 0 deletions
--- a/lottery_data_scraper/new_mexico.py
+++ b/lottery_data_scraper/new_mexico.py
@ -0,0 +1,113 @@
 import logging
 import os
 import re
 from xmlrpc import client
 import traceback
 from bs4 import BeautifulSoup as bs
 from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)

 # Root of the lottery site and the page listing the scratcher games.
 BASE_URL = "https://www.nmlottery.com"
 INDEX_URL = BASE_URL + "/games/scratchers"

 # Browser-like request headers.
 # NOTE(review): HEADERS is not referenced by the code visible in this module.
 _USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
 HEADERS = {"User-Agent": _USER_AGENT}
 def get_games(site_url):
    """
    Fetch the scratcher index page and collect the raw data for each game.

    Args:
        site_url: URL of the scratcher listing page.

    Returns:
        A list of ``(game_id, game_name, game_html)`` tuples.  ``game_id`` is
        the first run of digits in a ``<p class="game-number">`` element,
        ``game_name`` is the text of an ``<h3>`` element and ``game_html`` is
        a ``<div class="filter-block">`` element.  The three sequences are
        collected page-wide and paired by position, so the result is as long
        as the shortest of them.

    Raises:
        AttributeError: if a ``game-number`` element contains no digits.
    """
    html = fetch_html(site_url)
    soup = bs(html, "html.parser")
    games_html = soup.find_all("div", class_="filter-block")

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    digits = re.compile(r"\d+")
    ids = [
        digits.search(number_tag.text).group(0)
        for number_tag in soup.find_all("p", class_="game-number")
    ]
    game_names = [heading.text for heading in soup.find_all("h3")]
    return list(zip(ids, game_names, games_html))
 def _parse_count(text):
    """Convert a thousands-separated count such as ``"1,234"`` to an int."""
    return int(text.replace(",", ""))


 def _parse_prize_row(cells, price):
    """Build one prize dict from the cells ``[prize, odds, total, available]``."""
    label = cells[0]
    if "prize ticket" in label.lower():
        # A prize described as a ticket is valued at the ticket price.
        value = price
    else:
        value = float(label.replace("$", "").replace(",", ""))
    total = _parse_count(cells[2])
    available = _parse_count(cells[3])
    return {
        "prize": label.strip(),
        "value": value,
        "claimed": total - available,
        "available": available,
        "total": total,
        "odds": float(cells[1].replace(",", "")),
    }


 def process_game(game_info):
    """
    Turn one entry produced by ``get_games`` into a game dict.

    Args:
        game_info: sequence ``(game_id, game_name, game_html)`` where
            ``game_html`` is the parsed HTML element describing the game.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``price``,
        ``how_to_play``, ``prizes`` (list of dicts with ``prize``, ``value``,
        ``claimed``, ``available``, ``total``, ``odds``), ``num_tx_initial``,
        ``state`` and ``image_urls`` (a JSON-encoded list holding one URL).

    Raises:
        ValueError: if the prize table has no data rows or a cell cannot be
            parsed as a number.
        AttributeError, IndexError, KeyError, TypeError: if an expected
            element, cell or attribute is missing from the HTML.
    """
    import json  # local import: json is not imported at module level

    game_id = game_info[0]
    name = game_info[1]
    game_html = game_info[2]

    price = float(game_html.find("p", class_="price").text.replace("$", ""))
    how_to_play = game_html.find("p", class_="how-to-play").find_next("span").text

    # The first <tr> is skipped (presumably the header row).  Each remaining
    # row's text is split on newlines and the first and last pieces are
    # dropped (presumably the empty strings around the row's own newlines).
    prizes = [
        _parse_prize_row(row.text.split("\n")[1:-1], price)
        for row in game_html.table.find_all("tr")[1:]
    ]
    if not prizes:
        # Fail with a descriptive message instead of a bare IndexError below.
        raise ValueError(f"no prize rows found for game {game_id}")

    # Initial ticket count is estimated from the first prize row: odds * total.
    num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"])
    image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"]

    return {
        "name": name,
        "game_id": game_id,
        "price": price,
        "how_to_play": how_to_play,
        "prizes": prizes,
        "num_tx_initial": num_of_tix,
        "state": "nm",
        # json.dumps escapes quotes/backslashes that a hand-built string would not.
        "image_urls": json.dumps([image_url]),
    }
 def main():
    """
    Scrape every game listed on ``INDEX_URL`` and return the parsed results.

    Each entry returned by ``get_games`` is passed to ``process_game``.  A
    game that fails to parse is logged as a warning, with its traceback, and
    skipped, so one bad game does not abort the whole run.

    Returns:
        A list of the game dicts that were processed successfully.
    """
    final_games = []
    for game_info in get_games(INDEX_URL):
        try:
            final_games.append(process_game(game_info))
        except Exception:
            # exc_info attaches the exception and traceback to the log record.
            # The one-argument traceback.print_exception(exc) form used
            # previously only exists on Python 3.10+.
            logger.warning(
                "Unable to process game: %s-%s",
                game_info[0],
                game_info[1],
                exc_info=True,
            )
    return final_games
 if __name__ == "__main__":
    # Script entry point: scrape all games, then print them to stdout
    # serialized through GameSchema.
    scraped_games = main()
    serializer = GameSchema(many=True)
    print(serializer.dumps(scraped_games))
--- a/lottery_data_scraper/util.py
+++ b/lottery_data_scraper/util.py
@ -3,6 +3,7 @@ import os
 import requests
 from tempfile import gettempdir
 def fetch_html(url):
    """
    Helper to fetch and cache html responses.