Debugging image_urls validation error with save_image util

main
anela 2 years ago
parent 164d4bec0e
commit 1d4562f014

@ -5,15 +5,13 @@ from xmlrpc import client
import traceback
from bs4 import BeautifulSoup as bs
import requests
from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html
from lottery_data_scraper.util import save_image
from lotto_site_parsers.util import save_image
from lotto_site_parsers.util import save_game
logger = logging.getLogger(__name__)
DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989")
BASE_URL = "https://www.nmlottery.com"
INDEX_URL = "https://www.nmlottery.com/games/scratchers"
HEADERS = {
@ -27,7 +25,7 @@ def get_games(site_url):
parses page for game ids and game info
returns a list of tuples with the id and game info for each game
"""
html = requests.get(site_url, headers=HEADERS).text
html = fetch_html(site_url)
soup = bs(html, "html.parser")
games_html = soup.find_all("div", class_="filter-block")
@ -81,7 +79,8 @@ def process_game(game_info):
num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"])
image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"]
image_location = save_image("nm", game_id, image_url, headers=HEADERS)
# FIXME: "image_urls" currently NoneType and not passing GameSchema
# image_location = save_image("nm", game_id, image_url, headers=HEADERS)
game = {
"name": name,
@ -91,23 +90,26 @@ def process_game(game_info):
"prizes": prizes,
"num_tx_initial": num_of_tix,
"state": "nm",
"image_urls": '["{}"]'.format(image_location),
# "image_urls": '["{}"]'.format(image_url),
"image_urls": f'["{{image_url}}"]',
}
return game
def main():
    """
    Scrape every game found on INDEX_URL, save each one, and return them.

    Each entry produced by get_games() is passed to process_game() and the
    resulting game dict is handed to save_game(). A failure on one game is
    logged and skipped so the remaining games are still processed.

    Returns the list of game dicts that were processed and saved successfully.
    """
    final_games = []
    for game_info in get_games(INDEX_URL):
        try:
            # Keep the raw entry and the processed dict under separate names:
            # the error handler below indexes the raw entry, which must still
            # be available if save_game() fails after processing succeeded.
            game = process_game(game_info)
            save_game(game)
            final_games.append(game)
        except Exception as e:
            logger.warning(f"Unable to process game: {game_info[0]}-{game_info[1]}")
            logger.warning(e)
            # Three-argument form works on every Python 3 version; the
            # single-argument form requires 3.10+.
            traceback.print_exception(type(e), e, e.__traceback__)
    # The script entry point serialises the result of main(), so hand it back.
    return final_games
if __name__ == "__main__":
main()
games = main()
schema = GameSchema(many=True)
print(schema.dumps(games))

@ -1,8 +1,13 @@
import logging
import base64
import os
import re
import requests
from tempfile import gettempdir
logger = logging.getLogger(__name__)
def fetch_html(url):
"""
Helper to fetch and cache html responses.
@ -44,3 +49,40 @@ def fetch_html(url):
with open(filepath, "w+") as f:
f.write(html)
return html
def save_image(state, filename, url, headers=None):
    """
    Download the image at `url` and store it under the image directory.

    Takes an abbreviation for a state, a filename (the game id), the url of
    the image, and optional request headers.
    The function:
    -parses the URL path for the file extension
    -establishes the image directory (IMAGE_DIR env var, otherwise
     ~/.data/assets/images) and creates the per-state subdirectory
    -streams the image bytes to <image dir>/<state>/<filename>.<extension>

    Returns the relative location "<state>/<filename>.<extension>" on success,
    or None when the request fails or the response status is not 200.
    """
    from urllib.parse import urlparse

    headers = headers or {}

    # Only look at the path component so dots in the host name or in the
    # query string are never mistaken for the file extension.
    extension = os.path.splitext(urlparse(url).path)[1].lstrip(".")
    if extension:
        filename = f"{filename}.{extension}"
    else:
        filename = f"{filename}"

    # Resolve the default lazily: it is only needed when IMAGE_DIR is unset,
    # and expanduser() still works when HOME itself is missing.
    image_root = os.getenv("IMAGE_DIR") or os.path.realpath(
        os.path.join(os.path.expanduser("~"), ".data/assets/images")
    )
    dirpath = f"{image_root}/{state}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dirpath, exist_ok=True)
    filepath = os.path.realpath(os.path.join(dirpath, filename))

    try:
        response = requests.get(url, stream=True, headers=headers)
    except Exception as e:
        logger.warning("Unable to download {}.\n{}".format(url, e))
        return None

    # Streamed responses hold their connection open until closed explicitly.
    with response:
        if response.status_code != 200:
            logger.warning(
                "Unable to download {}. {} - {}".format(
                    url, response.status_code, response
                )
            )
            return None
        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return "{}/{}".format(state, filename)

Loading…
Cancel
Save