From 1d4562f014dbee9fb24b2197233af2cae9bd3a55 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 11:20:40 -0700 Subject: [PATCH] Debugging image_urls validation error with save_image util --- .../new_mexico.py | 24 ++++++----- lottery_data_scraper/util.py | 42 +++++++++++++++++++ 2 files changed, 55 insertions(+), 11 deletions(-) rename new_mexico.py => lottery_data_scraper/new_mexico.py (82%) diff --git a/new_mexico.py b/lottery_data_scraper/new_mexico.py similarity index 82% rename from new_mexico.py rename to lottery_data_scraper/new_mexico.py index 1313be5..924db38 100644 --- a/new_mexico.py +++ b/lottery_data_scraper/new_mexico.py @@ -5,15 +5,13 @@ from xmlrpc import client import traceback from bs4 import BeautifulSoup as bs -import requests +from lottery_data_scraper.schemas import GameSchema +from lottery_data_scraper.util import fetch_html +from lottery_data_scraper.util import save_image -from lotto_site_parsers.util import save_image -from lotto_site_parsers.util import save_game - logger = logging.getLogger(__name__) -DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989") BASE_URL = "https://www.nmlottery.com" INDEX_URL = "https://www.nmlottery.com/games/scratchers" HEADERS = { @@ -27,7 +25,7 @@ def get_games(site_url): parses page for game ids and game info returns and list of tuples with the id and game info for each game """ - html = requests.get(site_url, headers=HEADERS).text + html = fetch_html(site_url) soup = bs(html, "html.parser") games_html = soup.find_all("div", class_="filter-block") @@ -81,7 +79,8 @@ def process_game(game_info): num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"]) image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"] - image_location = save_image("nm", game_id, image_url, headers=HEADERS) + # FIXME: "image_urls" currently NoneType and not passing GameSchema + # image_location = save_image("nm", game_id, image_url, headers=HEADERS) game = { "name": name, @@ -91,23 
+90,27 @@ def process_game(game_info): "prizes": prizes, "num_tx_initial": num_of_tix, "state": "nm", - "image_urls": '["{}"]'.format(image_location), + # "image_urls": '["{}"]'.format(image_url), + "image_urls": f'["{image_url}"]', } return game def main(): + final_games = [] games = get_games(INDEX_URL) for game in games: try: game = process_game(game) - save_game(game) + final_games.append(game) except Exception as e: logger.warning(f"Unable to process game: {game[0]}-{game[1]}") logger.warning(e) traceback.print_exception(e) + return final_games - if __name__ == "__main__": - main() + games = main() + schema = GameSchema(many=True) + print(schema.dumps(games)) \ No newline at end of file diff --git a/lottery_data_scraper/util.py b/lottery_data_scraper/util.py index a58400e..cf7c4c1 100644 --- a/lottery_data_scraper/util.py +++ b/lottery_data_scraper/util.py @@ -1,8 +1,13 @@ +import logging import base64 import os +import re import requests from tempfile import gettempdir + + logger = logging.getLogger(__name__) + def fetch_html(url): """ Helper to fetch and cache html responses.
@@ -44,3 +49,40 @@ def fetch_html(url): with open(filepath, "w+") as f: f.write(html) return html + + +def save_image(state, filename, url, headers=None): + """ + Takes an abbreviation for a state, filename(game_id), url of image location, and headers + + The function: + -parses the URL for the filetype + -establishes the image directory + -locates or creates a filepath for images + -writes image info to file + """ + headers = headers or {} + extension = re.search(r"\.([^\.\?]*)($|[^\.]+$)", url).group(1) + IMAGE_DIR = os.getenv( + "IMAGE_DIR", + os.path.realpath(os.path.join(os.getenv("HOME"), ".data/assets/images")), + ) + IMAGE_DIR = f"{IMAGE_DIR}/{state}" + dirpath = IMAGE_DIR + if not os.path.exists(dirpath): + os.makedirs(dirpath) + filename = f"{filename}.{extension}" + filepath = os.path.realpath(os.path.join(dirpath, filename)) + try: + r = requests.get(url, stream=True, headers=headers) + except Exception as e: + logger.warn("Unable to download {}.\n{}".format(url, e)) + return None + if r.status_code == 200: + with open(filepath, "wb") as f: + for chunk in r: + f.write(chunk) + else: + logger.warn("Unable to download {}. {} - {}".format(url, r.status_code, r)) + return None + return "{}/{}".format(state, filename)