moved fetch_html to util file

main
Taylor Hood 1 year ago
parent fdafaae267
commit 8e11aae073

@@ -48,6 +48,7 @@ from tempfile import gettempdir
from bs4 import BeautifulSoup as bs
import requests
from lottery_data_scraper.schemas import GameSchema
+from lottery_data_scraper.util import fetch_html

logger = logging.getLogger(__name__)
locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
@@ -58,49 +59,6 @@ BASE_URL = "https://www.palottery.state.pa.us"
INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"
-def fetch_html(url):
-    """
-    Helper to fetch and cache HTML responses.
-
-    During development and while testing, we'll be hitting the same URLs often.
-    The content of the pages probably won't be changing.
-    Caching the results will speed up development,
-    and the servers will appreciate us for not spamming requests.
-
-    The responses are cached in the operating system's tempfile directory.
-    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
-
-    The filename is based on the URL. But since the URL might contain
-    characters that are invalid for filenames, we base64 encode the URL.
-    """
-    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
-    filepath = os.path.join(gettempdir(), safe_filename)
-    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
-        with open(filepath, "r") as f:
-            return f.read()
-    else:
-        # We are relying on the outside world when we make a request, so we
-        # might want to wrap this in a try/except. But we'd
-        # only want to do that in two cases.
-        #
-        # 1. We have a way of handling exceptions. A good example
-        #    would be to catch exceptions and retry the request;
-        #    maybe the network was down.
-        #
-        # 2. We can't handle the exception, but we want to log something
-        #    more useful than the stack trace that will get spit out if
-        #    we just let the exception go uncaught.
-        #
-        # In this case, I don't think it's worth muddying up the code
-        # trying to handle exceptions here. It's easy enough to just re-run
-        # the script.
-        html = requests.get(url).text
-        if os.environ.get("USE_CACHE", False):
-            with open(filepath, "w+") as f:
-                f.write(html)
-        return html
def find_game_names(html):
    """
    Game names can be found on the index page

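The comment inside fetch_html above weighs when a try/except around requests.get would be worth it. For the first case it describes, the usual pattern is a retry wrapper with backoff. The sketch below is illustrative only and not part of this commit; the name fetch_with_retries and its retry/backoff parameters are hypothetical.

import time

import requests


def fetch_with_retries(url, retries=3, backoff=2):
    """Hypothetical retry wrapper, per case 1 in the comment above.

    Retries transient network failures with exponential backoff, then
    re-raises so the caller still sees the error if the network stays down.
    """
    for attempt in range(retries):
        try:
            return requests.get(url).text
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise  # out of retries; let the caller log it or crash
            time.sleep(backoff ** attempt)  # sleep 1s, 2s, 4s, ...

Re-raising on the final attempt also covers case 2: the caller still gets a real exception to log rather than a silent failure.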
@@ -0,0 +1,46 @@
+import base64
+import os
+import requests
+from tempfile import gettempdir
+
+def fetch_html(url):
+    """
+    Helper to fetch and cache HTML responses.
+
+    During development and while testing, we'll be hitting the same URLs often.
+    The content of the pages probably won't be changing.
+    Caching the results will speed up development,
+    and the servers will appreciate us for not spamming requests.
+
+    The responses are cached in the operating system's tempfile directory.
+    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.
+
+    The filename is based on the URL. But since the URL might contain
+    characters that are invalid for filenames, we base64 encode the URL.
+    """
+    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
+    filepath = os.path.join(gettempdir(), safe_filename)
+    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
+        with open(filepath, "r") as f:
+            return f.read()
+    else:
+        # We are relying on the outside world when we make a request, so we
+        # might want to wrap this in a try/except. But we'd
+        # only want to do that in two cases.
+        #
+        # 1. We have a way of handling exceptions. A good example
+        #    would be to catch exceptions and retry the request;
+        #    maybe the network was down.
+        #
+        # 2. We can't handle the exception, but we want to log something
+        #    more useful than the stack trace that will get spit out if
+        #    we just let the exception go uncaught.
+        #
+        # In this case, I don't think it's worth muddying up the code
+        # trying to handle exceptions here. It's easy enough to just re-run
+        # the script.
+        html = requests.get(url).text
+        if os.environ.get("USE_CACHE", False):
+            with open(filepath, "w+") as f:
+                f.write(html)
+        return html
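With the move, scrapers import fetch_html from lottery_data_scraper.util, as the first hunk shows. Here is a minimal usage sketch of the caching behavior, assuming the package is importable; the URL is just the index page from this scraper:

import base64
import os
from tempfile import gettempdir

from lottery_data_scraper.util import fetch_html

os.environ["USE_CACHE"] = "1"  # opt in to caching

url = "https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx"
html = fetch_html(url)  # first call hits the network and writes the cache file
html = fetch_html(url)  # second call reads the cached copy from the tempdir

# The cache path is the tempdir plus the base64-encoded URL.
safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
print(os.path.join(gettempdir(), safe_filename))

One caveat the function inherits as written: os.environ.get("USE_CACHE", False) only checks the string's truthiness, so USE_CACHE=0 still enables caching; unset the variable to disable it.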