diff --git a/lottery_data_scraper/new_mexico.py b/lottery_data_scraper/new_mexico.py
new file mode 100644
index 0000000..b7ce12e
--- /dev/null
+++ b/lottery_data_scraper/new_mexico.py
@@ -0,0 +1,155 @@
+"""Scrape scratcher game data from the New Mexico Lottery website."""
+
+import json
+import logging
+import os
+import re
+from xmlrpc import client
+import traceback
+
+from bs4 import BeautifulSoup as bs
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+
+
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://www.nmlottery.com"
+INDEX_URL = "https://www.nmlottery.com/games/scratchers"
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
+}
+
+
+def get_games(site_url):
+    """
+    Fetch the scratcher index page and collect the raw data for each game.
+
+    Game numbers, names and game blocks are each gathered page-wide and then
+    paired up by position.
+
+    Returns a list of (game_id, game_name, game_html) tuples.
+    """
+    html = fetch_html(site_url)
+    soup = bs(html, "html.parser")
+
+    games_html = soup.find_all("div", class_="filter-block")
+
+    # Raw string: "\d" in a plain literal is an invalid escape sequence.
+    ids = [
+        re.search(r"\d+", tag.text).group(0)
+        for tag in soup.find_all("p", class_="game-number")
+    ]
+
+    game_names = [tag.text for tag in soup.find_all("h3")]
+
+    # zip() silently truncates to the shortest input, so flag a mismatch
+    # instead of quietly dropping games.
+    if not len(ids) == len(game_names) == len(games_html):
+        logger.warning(
+            "Scraped %d ids, %d names and %d game blocks; pairing may be off",
+            len(ids),
+            len(game_names),
+            len(games_html),
+        )
+
+    return list(zip(ids, game_names, games_html))
+
+
+def _to_int(text):
+    """Parse an integer that may contain thousands separators."""
+    return int(text.replace(",", ""))
+
+
+def _parse_prize_row(cells, price):
+    """
+    Build one prize dict from the cell texts of a prize-table row.
+
+    cells is read as [prize label, odds, total, available].
+    """
+    label = cells[0]
+    total = _to_int(cells[2])
+    available = _to_int(cells[3])
+    if "prize ticket" in label.lower():
+        # A prize labelled as a ticket is valued at the game's ticket price.
+        value = price
+    else:
+        value = float(label.replace("$", "").replace(",", ""))
+    return {
+        "prize": label.strip(),
+        "value": value,
+        "claimed": total - available,
+        "available": available,
+        "total": total,
+        "odds": float(cells[1].replace(",", "")),
+    }
+
+
+def process_game(game_info):
+    """
+    Turn one (game_id, game_name, game_html) tuple into a game dict.
+
+    Reads the price, how-to-play text, prize table and image URL out of the
+    game's HTML block.
+
+    Returns a dict with the keys name, game_id, price, how_to_play, prizes,
+    num_tx_initial, state and image_urls (a JSON-encoded list).
+    """
+    game_id, name, game_html = game_info[:3]
+
+    price = float(game_html.find("p", class_="price").text.replace("$", ""))
+
+    how_to_play = game_html.find("p", class_="how-to-play").find_next("span").text
+
+    # Skip the first table row (presumably the header); [1:-1] drops the first
+    # and last pieces that splitting each row's text on newlines produces.
+    rows = [tr.text.split("\n")[1:-1] for tr in game_html.table.find_all("tr")[1:]]
+    prizes = [_parse_prize_row(cells, price) for cells in rows]
+
+    # Odds of the first listed prize times how many of that prize exist.
+    num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"])
+
+    image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"]
+
+    return {
+        "name": name,
+        "game_id": game_id,
+        "price": price,
+        "how_to_play": how_to_play,
+        "prizes": prizes,
+        "num_tx_initial": num_of_tix,
+        "state": "nm",
+        # json.dumps escapes characters a hand-built string would not.
+        "image_urls": json.dumps([image_url]),
+    }
+
+
+def main():
+    """
+    Scrape every game listed on the index page.
+
+    Games that fail to parse are logged and skipped.
+
+    Returns a list of game dicts.
+    """
+    final_games = []
+    for game_info in get_games(INDEX_URL):
+        try:
+            final_games.append(process_game(game_info))
+        except Exception:
+            # Best-effort scrape: one bad game must not abort the run.
+            # exc_info=True logs the traceback; the one-argument form of
+            # traceback.print_exception() only exists on Python 3.10+.
+            logger.warning(
+                "Unable to process game: %s-%s",
+                game_info[0],
+                game_info[1],
+                exc_info=True,
+            )
+    return final_games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
diff --git a/lottery_data_scraper/util.py b/lottery_data_scraper/util.py
index a58400e..1b05ab2 100644
--- a/lottery_data_scraper/util.py
+++ b/lottery_data_scraper/util.py
@@ -3,6 +3,7 @@ import os
 import requests
 from tempfile import gettempdir
 
+
 def fetch_html(url):
     """
     Helper to fetch and cache html responses.