moved fetch_html to util file

main
Taylor Hood 2 years ago
parent fdafaae267
commit 8e11aae073

@@ -48,6 +48,7 @@ from tempfile import gettempdir
from bs4 import BeautifulSoup as bs
import requests
from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html

logger = logging.getLogger(__name__)
locale.setlocale(locale.LC_MONETARY, "en_US.UTF-8")
@@ -58,49 +59,6 @@ BASE_URL = "https://www.palottery.state.pa.us"
INDEX_URL = f"{BASE_URL}/Scratch-Offs/Active-Games.aspx"


def fetch_html(url):
    """
    Helper to fetch and cache HTML responses.

    During development and while testing, we'll be hitting the same URLs often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.

    The responses are cached in the operating system's tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.

    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)
    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling the exception.
        #    A good example would be to catch exceptions and retry the
        #    request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        #    more useful than the stack trace that will get spit out if
        #    we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html


def find_game_names(html):
    """
    Game names can be found on the index page

lottery_data_scraper/util.py
@@ -0,0 +1,46 @@
import base64
import os
import requests
from tempfile import gettempdir

def fetch_html(url):
    """
    Helper to fetch and cache HTML responses.

    During development and while testing, we'll be hitting the same URLs often.
    The content of the pages probably won't be changing.
    Caching the results will speed up development,
    and the servers will appreciate us for not spamming requests.

    The responses are cached in the operating system's tempfile directory.
    That's probably /tmp/ or /var/tmp/ on Unix flavors and C:/temp/ on Windows.

    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    """
    safe_filename = base64.urlsafe_b64encode(bytes(url, "utf-8")).decode("utf-8")
    filepath = os.path.join(gettempdir(), safe_filename)
    if os.path.isfile(filepath) and os.environ.get("USE_CACHE", False):
        with open(filepath, "r") as f:
            return f.read()
    else:
        # We are relying on the outside world when we make a request, so we
        # might want to wrap this in a try/except. But we'd
        # only want to do that in two cases.
        #
        # 1. We have a way of handling the exception.
        #    A good example would be to catch exceptions and retry the
        #    request; maybe the network was down.
        #
        # 2. We can't handle the exception, but we want to log something
        #    more useful than the stack trace that will get spit out if
        #    we just let the exception go uncaught.
        #
        # In this case, I don't think it's worth muddying up the code
        # trying to handle exceptions here. It's easy enough to just re-run
        # the script.
        html = requests.get(url).text
        if os.environ.get("USE_CACHE", False):
            with open(filepath, "w+") as f:
                f.write(html)
        return html
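A note on the cache-key scheme described in the docstring: urlsafe_b64encode maps the URL onto a filename-safe alphabet ("-" and "_" instead of "+" and "/"), and the encoding is reversible. A minimal standalone sketch of that round trip (the example URL is just the index page from this diff):

import base64
import os
from tempfile import gettempdir

url = "https://www.palottery.state.pa.us/Scratch-Offs/Active-Games.aspx"

# Same derivation as fetch_html: encode the URL so it's a valid filename.
safe_filename = base64.urlsafe_b64encode(url.encode("utf-8")).decode("utf-8")
print(os.path.join(gettempdir(), safe_filename))  # e.g. /tmp/aHR0cHM6...

# The mapping is reversible, so a cached file can be traced back to its URL.
assert base64.urlsafe_b64decode(safe_filename).decode("utf-8") == url

One behavior worth knowing when toggling the cache: os.environ.get("USE_CACHE", False) returns a string whenever the variable is set, and any non-empty string is truthy, so USE_CACHE=0 still enables the cache; unset the variable to disable it.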
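On the try/except discussion in the comments: if retrying ever does become worth it (case 1), a wrapper along these lines would be one way to do it. This is only an illustrative sketch, not part of the commit; the function name and retry parameters are invented here:

import time

import requests

def fetch_with_retries(url, attempts=3, backoff_seconds=2):
    # Hypothetical helper illustrating "case 1" from the comments:
    # retry transient network failures with a simple linear backoff.
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            if attempt == attempts:
                # Out of attempts; let the caller see the original error.
                raise
            time.sleep(backoff_seconds * attempt)

For a scraper like this one, though, the comment's conclusion holds up: letting the exception propagate and re-running the script is simpler than handling it halfway.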