moved fetch_html to util file
parent
fdafaae267
commit
8e11aae073
@ -0,0 +1,46 @@
|
|||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from tempfile import gettempdir
|
||||||
|
|
||||||
|
def fetch_html(url):
    """
    Fetch the HTML at ``url``, optionally caching the response on disk.

    During development and while testing, we'll be hitting the same urls
    often, and the content of the pages probably won't be changing.
    Caching the results will speed up development, and the servers will
    appreciate us for not spamming requests.

    Caching is opt-in: it is only active when the ``USE_CACHE`` environment
    variable is set to a non-empty value. Note that *any* non-empty string
    enables it, including "0" or "false".

    The responses are cached in the operating system's tempfile directory
    (``tempfile.gettempdir()``). That's probably /tmp/ or /var/tmp/ on Unix
    flavors and C:/temp/ on Windows.

    The filename is based on the URL. But since the URL might contain
    characters that are invalid for filenames, we base64 encode the URL.
    If the encoded name would be longer than 255 characters (the usual
    per-filename limit), the SHA-256 hex digest of the URL is used instead
    so that long URLs can still be cached.

    Args:
        url: The address to fetch, as a ``str``.

    Returns:
        The response body decoded as text.

    Raises:
        requests.exceptions.RequestException: Network failures and timeouts
            are deliberately not caught here (see the comment in the body).
    """
    # Local import: only this function needs it.
    import hashlib

    max_filename_len = 255  # common NAME_MAX; base64 output is pure ASCII
    request_timeout = 30  # seconds; without one, requests can block forever

    use_cache = bool(os.environ.get("USE_CACHE", False))

    url_bytes = bytes(url, "utf-8")
    safe_filename = base64.urlsafe_b64encode(url_bytes).decode("ascii")
    if len(safe_filename) > max_filename_len:
        # The base64 name would be rejected by the filesystem; a digest is
        # fixed-length and still unique per URL for our purposes.
        safe_filename = hashlib.sha256(url_bytes).hexdigest()
    filepath = os.path.join(gettempdir(), safe_filename)

    if use_cache and os.path.isfile(filepath):
        # Explicit encoding keeps the cache independent of the locale, and
        # newline="" disables newline translation so a cached read returns
        # exactly the text that was fetched.
        with open(filepath, "r", encoding="utf-8", newline="") as f:
            return f.read()

    # We are relying on the outside world when we make a request, so we
    # might want to wrap this in a try/except. But we'd only want to do
    # that in two cases.
    #
    # 1. We have a way of handling exceptions. A good example would be to
    #    catch exceptions and retry the request; maybe the network was down.
    #
    # 2. We can't handle the exception, but we want to log something more
    #    useful than the stack trace that will get spit out if we just let
    #    the exception go uncaught.
    #
    # In this case, I don't think it's worth muddying up the code trying to
    # handle exceptions here. It's easy enough to just re-run the script.
    response = requests.get(url, timeout=request_timeout)
    html = response.text

    # Only cache successful responses; otherwise a transient error page
    # would be served from the cache on every later call.
    if use_cache and response.ok:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.write(html)
    return html
|
Loading…
Reference in New Issue