From 8e11aae073094cea903f4aba233ea54b0a8a1407 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Mon, 10 Apr 2023 18:50:17 -0700
Subject: [PATCH] moved fetch_html to util file

---
 lottery_data_scraper/pennsylvania.py | 44 +-------------------------
 lottery_data_scraper/util.py         | 46 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 43 deletions(-)
 create mode 100644 lottery_data_scraper/util.py

diff --git a/lottery_data_scraper/pennsylvania.py b/lottery_data_scraper/pennsylvania.py
index 13ee267..f8b44e1 100644
--- a/lottery_data_scraper/pennsylvania.py
+++ b/lottery_data_scraper/pennsylvania.py
@@ -48,6 +48,7 @@ from tempfile import gettempdir
 from bs4 import BeautifulSoup as bs
 import requests
 from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html
 
 logger = logging.getLogger(__name__)
 locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
@@ -58,49 +59,6 @@ BASE_URL = "https://www.palottery.state.pa.us"
 INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"
 
 
-def fetch_html(url):
-    """
-    Helper to fetch and cache html responses.
-
-    During development and while testing, we'll be hitting the same urls often.
-    The content of the pages probably won't be changing.
-    Caching the results will speed up development,
-    and the servers will appreciate us for not spamming requests.
-
-    The responses are cached in the operating systems tempfile directory.
-    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
-    The filename is based on the URL. But since the URL might contain
-    characters that are invalid for filenames, we base64 encode the URL.
-    """
-    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
-    filepath = os.path.join(gettempdir(), safe_filename)
-
-    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
-        with open(filepath, "r") as f:
-            return f.read()
-    else:
-        # We are relying on the outside world when we make a request, so we
-        # might want to wrap this in a try/except. But we'd
-        # only want to do that in two cases.
-        #
-        # 1. We have a way of handling exceptions,
-        #    A good example would be to catch exceptions and retry the
-        #    request; maybe the network was down.
-        #
-        # 2. We can't handle the exception, but we want to log something
-        #    more useful than the stack trace that will get spit out if
-        #    we just let the exception go uncaught.
-        #
-        # In this case, I don't think it's worth muddying up the code
-        # trying to handle exceptions here. It's easy enough to just re-run
-        # the script.
-        html = requests.get(url).text
-        if os.environ.get("USE_CACHE", False):
-            with open(filepath, "w+") as f:
-                f.write(html)
-        return html
-
-
 def find_game_names(html):
     """
     Game names can be found on the index page
diff --git a/lottery_data_scraper/util.py b/lottery_data_scraper/util.py
new file mode 100644
index 0000000..a58400e
--- /dev/null
+++ b/lottery_data_scraper/util.py
@@ -0,0 +1,46 @@
+import base64
+import os
+import requests
+from tempfile import gettempdir
+
+def fetch_html(url):
+    """
+    Helper to fetch and cache html responses.
+
+    During development and while testing, we'll be hitting the same urls often.
+    The content of the pages probably won't be changing.
+    Caching the results will speed up development,
+    and the servers will appreciate us for not spamming requests.
+
+    The responses are cached in the operating systems tempfile directory.
+    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
+    The filename is based on the URL. But since the URL might contain
+    characters that are invalid for filenames, we base64 encode the URL.
+    """
+    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
+    filepath = os.path.join(gettempdir(), safe_filename)
+
+    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
+        with open(filepath, "r") as f:
+            return f.read()
+    else:
+        # We are relying on the outside world when we make a request, so we
+        # might want to wrap this in a try/except. But we'd
+        # only want to do that in two cases.
+        #
+        # 1. We have a way of handling exceptions,
+        #    A good example would be to catch exceptions and retry the
+        #    request; maybe the network was down.
+        #
+        # 2. We can't handle the exception, but we want to log something
+        #    more useful than the stack trace that will get spit out if
+        #    we just let the exception go uncaught.
+        #
+        # In this case, I don't think it's worth muddying up the code
+        # trying to handle exceptions here. It's easy enough to just re-run
+        # the script.
+        html = requests.get(url).text
+        if os.environ.get("USE_CACHE", False):
+            with open(filepath, "w+") as f:
+                f.write(html)
+        return html
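
A quick interpreter check of the filename scheme the docstring describes. The example URL is made up; urlsafe_b64encode turns it into a string with no path separators, so it is safe to use as a filename (the urlsafe alphabet swaps "+" and "/" for "-" and "_", and the trailing "=" padding is a legal filename character on both Unix and Windows):

    >>> import base64
    >>> base64.urlsafe_b64encode(bytes("https://example.com/a?b=1", "utf-8")).decode("utf-8")
    'aHR0cHM6Ly9leGFtcGxlLmNvbS9hP2I9MQ=='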
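
For reference, a minimal sketch of how a scraper consumes the relocated helper. The import matches the line added to pennsylvania.py above, and the URL is that module's INDEX_URL. Note the cache gate only tests truthiness, so any non-empty USE_CACHE value (even "0") enables caching:

    import os

    from lottery_data_scraper.util import fetch_html

    os.environ["USE_CACHE"] = "1"  # any non-empty string turns the cache on

    url = "https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx"
    first = fetch_html(url)   # network request; response body written to tempdir
    second = fetch_html(url)  # same URL again: served from the cached file
    assert first == second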
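
The comment block in fetch_html weighs two cases where a try/except around the request would earn its keep. As a sketch of case 1 only (hypothetical, not part of this patch; the function name and retry parameters are invented for illustration), the request step could be wrapped like this:

    import time

    import requests

    def get_with_retries(url, attempts=3, delay=1.0):
        # Retry transient network failures a few times before giving up.
        for attempt in range(1, attempts + 1):
            try:
                return requests.get(url).text
            except requests.RequestException:
                if attempt == attempts:
                    raise  # out of retries; let the caller see the error
                time.sleep(delay)  # maybe the network was down; wait and retry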