moved fetch_html to util file

3 years ago · 8e11aae073
parent fdafaae267
commit 8e11aae073
2 changed files with 47 additions and 43 deletions
--- a/lottery_data_scraper/pennsylvania.py
+++ b/lottery_data_scraper/pennsylvania.py
@ -48,6 +48,7 @@ from tempfile import gettempdir
 from bs4 import BeautifulSoup as bs
 import requests
 from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 logger = logging.getLogger(__name__)
 locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
@ -58,49 +59,6 @@ BASE_URL = "https://www.palottery.state.pa.us"
 INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"
 def fetch_html(url):
    """
    Helper to fetch and cache html responses.
    During development and while testing, we'll be hitting the same urls often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.
    The responses are cached in the operating systems tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)
    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling exceptions,
        # A good example would be to catch exceptions and retry the
        # request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        # more useful than the stack trace that will get spit out if
        # we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html
 def find_game_names(html):
    """
    Game names can be found on the index page
--- a/lottery_data_scraper/util.py
+++ b/lottery_data_scraper/util.py
@ -0,0 +1,46 @@
 import base64
 import os
 import requests
 from tempfile import gettempdir
 def fetch_html(url):
    """
    Helper to fetch and cache html responses.
    During development and while testing, we'll be hitting the same urls often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.
    The responses are cached in the operating systems tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)
    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling exceptions,
        # A good example would be to catch exceptions and retry the
        # request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        # more useful than the stack trace that will get spit out if
        # we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html