Debugging image_urls validation error with save_image util

Branch: main
Author: anela, 1 year ago
Parent: 164d4bec0e
Commit: 1d4562f014

The first file in the commit is the New Mexico scratcher scraper (BASE_URL points at nmlottery.com and games are tagged "state": "nm"):

@@ -5,15 +5,13 @@ from xmlrpc import client
 import traceback
 from bs4 import BeautifulSoup as bs
-import requests
-from lotto_site_parsers.util import save_image
-from lotto_site_parsers.util import save_game
+from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
+from lottery_data_scraper.util import save_image

 logger = logging.getLogger(__name__)

-DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989")
 BASE_URL = "https://www.nmlottery.com"
 INDEX_URL = "https://www.nmlottery.com/games/scratchers"
 HEADERS = {
@@ -27,7 +25,7 @@ def get_games(site_url):
     parses page for game ids and game info
     returns and list of tuples with the id and game info for each game
     """
-    html = requests.get(site_url, headers=HEADERS).text
+    html = fetch_html(site_url)
     soup = bs(html, "html.parser")
     games_html = soup.find_all("div", class_="filter-block")
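The hunk above swaps the direct requests.get call for the fetch_html helper imported from lottery_data_scraper.util; its docstring later in this diff describes it as fetching and caching HTML responses, so repeated runs of the scraper can reuse a cached copy of the index page. A minimal usage sketch, assuming only the one-argument call signature shown here and the caching behaviour described in that docstring:

    from bs4 import BeautifulSoup as bs

    from lottery_data_scraper.util import fetch_html

    # First call downloads the page and caches it; later calls for the same
    # URL are served from the cache rather than the network.
    html = fetch_html("https://www.nmlottery.com/games/scratchers")
    soup = bs(html, "html.parser")

    # Each scratcher sits in a "filter-block" div, per get_games above.
    games_html = soup.find_all("div", class_="filter-block")
    print(f"found {len(games_html)} game blocks")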
@@ -81,7 +79,8 @@ def process_game(game_info):
     num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"])
     image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"]

-    image_location = save_image("nm", game_id, image_url, headers=HEADERS)
+    # FIXME: "image_urls" currently NoneType and not passing GameSchema
+    # image_location = save_image("nm", game_id, image_url, headers=HEADERS)

     game = {
         "name": name,
@@ -91,23 +90,26 @@ def process_game(game_info):
         "prizes": prizes,
         "num_tx_initial": num_of_tix,
         "state": "nm",
-        "image_urls": '["{}"]'.format(image_location),
+        # "image_urls": '["{}"]'.format(image_url),
+        "image_urls": f'["{{image_url}}"]',
     }
     return game


 def main():
+    final_games = []
     games = get_games(INDEX_URL)
     for game in games:
         try:
             game = process_game(game)
-            save_game(game)
+            final_games.append(game)
         except Exception as e:
             logger.warning(f"Unable to process game: {game[0]}-{game[1]}")
             logger.warning(e)
             traceback.print_exception(e)


 if __name__ == "__main__":
-    main()
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
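Two details in this last hunk matter for the follow-up fix. The new f-string f'["{{image_url}}"]' doubles its braces, so it evaluates to the literal text ["{image_url}"] instead of interpolating the scraped URL; the commented-out '["{}"]'.format(...) line suggests the schema wants a JSON-style list of URL strings, which json.dumps produces directly. Also, main() fills final_games but never returns it, so games = main() in the __main__ block is None by the time schema.dumps(games) runs. A small sketch of the f-string point, with a hypothetical URL standing in for the scraped src attribute:

    import json

    # Hypothetical URL standing in for the scraped <img> "src" value.
    image_url = "https://www.nmlottery.com/wp-content/uploads/scratcher-001.png"

    # The committed f-string escapes its braces, so the schema receives the
    # literal text '["{image_url}"]' rather than the URL:
    assert f'["{{image_url}}"]' == '["{image_url}"]'

    # json.dumps builds the same '["<url>"]' shape that the previous
    # '["{}"]'.format(image_location) call produced, with proper escaping:
    assert json.dumps([image_url]) == f'["{image_url}"]'

Adding return final_games at the end of main() would then let the __main__ block serialize the collected games with GameSchema(many=True).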

The second file in the commit is the shared util module that provides fetch_html and gains the save_image helper (its path is not shown in this view):

@@ -1,8 +1,13 @@
+import logging
 import base64
 import os
+import re
 import requests
 from tempfile import gettempdir

+logger = logging.getLogger(__name__)
+
+
 def fetch_html(url):
     """
     Helper to fetch and cache html responses.
@@ -44,3 +49,40 @@ def fetch_html(url):
     with open(filepath, "w+") as f:
         f.write(html)
     return html
+
+
+def save_image(state, filename, url, headers=None):
+    """
+    Takes an abbreviates for a state, filename(game_id), url of image location, and headers
+
+    The function:
+    -parses the URL for the filetype
+    -establishes the image directory
+    -locates or create a filepath for images
+    -writes image info to file
+    """
+    headers = headers or {}
+    extension = re.search(r"\.([^\.\?]*)($|[^\.]+$)", url).group(1)
+    IMAGE_DIR = os.getenv(
+        "IMAGE_DIR",
+        os.path.realpath(os.path.join(os.getenv("HOME"), ".data/assets/images")),
+    )
+    IMAGE_DIR = f"{IMAGE_DIR}/{state}"
+    dirpath = IMAGE_DIR
+    if not os.path.exists(dirpath):
+        os.makedirs(dirpath)
+    filename = f"{filename}.{extension}"
+    filepath = os.path.realpath(os.path.join(dirpath, filename))
+    try:
+        r = requests.get(url, stream=True, headers=headers)
+    except Exception as e:
+        logger.warn("Unable to download {}.\n{}".format(url, e))
+        return None
+    if r.status_code == 200:
+        with open(filepath, "wb") as f:
+            for chunk in r:
+                f.write(chunk)
+    else:
+        logger.warn("Unable to download {}. {} - {}".format(url, r.status_code, r))
+        return None
+    return "{}/{}".format(state, filename)
