From 9e7a64d711b3201ddf14b3f36c8aef755352b2b5 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 10:28:30 -0700 Subject: [PATCH 1/7] Initial commit --- lottery_data_scraper/maryland.py | 118 +++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 lottery_data_scraper/maryland.py diff --git a/lottery_data_scraper/maryland.py b/lottery_data_scraper/maryland.py new file mode 100644 index 0000000..53a420b --- /dev/null +++ b/lottery_data_scraper/maryland.py @@ -0,0 +1,118 @@ +import logging +import os +import re +from xmlrpc import client + +import html2text +import requests +from selenium import webdriver +from bs4 import BeautifulSoup as bs +from lotto_site_parsers.util import save_image + +logger = logging.getLogger(__name__) + +s = requests.Session() +h = html2text.HTML2Text() + +DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989") +BASE_URL = "https://www.mdlottery.com" +BASE_INDEX_URL = "https://www.mdlottery.com/games/scratch-offs/" +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0", + "Host": "www.mdlottery.com", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "en-US,en;q=0.5", +} +INDEX_URL = "https://www.mdlottery.com/wp-admin/admin-ajax.php?action=jquery_shortcode&shortcode=scratch_offs" + + +def _name(game_div): + return game_div.find(class_="name").text + + +def _num(game_li): + return game_li.find(text="Game: ").next.text + + +def _price(game_li): + return int(game_li.find(class_="price").text.replace("$", "")) + + +def _odds(game_li): + odds = game_li.find(class_="probability").text + return float(odds) + + +def _num_tx(game_li): + return int(sum(p["available"] + p["claimed"] for p in _prizes(game_li)) * _odds(game_li)) + + +def _prizes(game_li): + table = game_li.find("table") + rows = table.find_all("tr")[1:] + prizes = [] + for row in rows: + cells = row.find_all("td") + prize = cells[0].text + value = float(re.sub(r"[\$,]", "", prize)) + available = int(cells[2].text) + claimed = int(cells[1].text) - available + prizes.append( + {"prize": prize, "value": value, "available": available, "claimed": claimed} + ) + return prizes + + +def _how_to_play(game_li): + return h.handle(str(game_li.find(class_="how-to-play"))) + + +def games(requests): + # Headless needed to run on server with no display + options = webdriver.firefox.options.Options() + options.headless = True + driver = webdriver.Firefox(options=options) + driver.get(INDEX_URL) + html = driver.page_source + soup = bs(html, "lxml") + game_lis = soup.find_all("li", class_="ticket") + games = [ + { + "name": _name(game_li), + "game_id": _num(game_li), + "url": BASE_INDEX_URL, + "how_to_play": _how_to_play(game_li), + "price": _price(game_li), + "state": "md", + "num_tx_initial": _num_tx(game_li), + "prizes": _prizes(game_li), + } + for game_li in game_lis + ] + return games + + +def fetch_games(): + result_games = [] + for game in games(s): + result_games.append(game) + return result_games + + +def save_game(game): + with client.ServerProxy(DB_REPO_URI) as c: + logger.debug("Saving game: {} - {}".format(game["game_id"], game["name"])) + c.persist([game]) + + +def main(): + logger.info("Saving games to {}".format(DB_REPO_URI)) + for game in fetch_games(): + save_game(game) + + +if __name__ == "__main__": + main() From ad73070e6b10db6b84d8ced020ca164c215dfa61 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 10:41:22 -0700 Subject: [PATCH 2/7] Debugging Maryland --- lottery_data_scraper/maryland.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lottery_data_scraper/maryland.py b/lottery_data_scraper/maryland.py index 53a420b..5d49778 100644 --- a/lottery_data_scraper/maryland.py +++ b/lottery_data_scraper/maryland.py @@ -7,7 +7,6 @@ import html2text import requests from selenium import webdriver from bs4 import BeautifulSoup as bs -from lotto_site_parsers.util import save_image logger = logging.getLogger(__name__) @@ -97,21 +96,23 @@ def games(requests): def fetch_games(): result_games = [] - for game in games(s): + for game in games(s)[:2]: + print("fetch_games!", game) result_games.append(game) return result_games -def save_game(game): - with client.ServerProxy(DB_REPO_URI) as c: - logger.debug("Saving game: {} - {}".format(game["game_id"], game["name"])) - c.persist([game]) +# def save_game(game): +# with client.ServerProxy(DB_REPO_URI) as c: +# logger.debug("Saving game: {} - {}".format(game["game_id"], game["name"])) +# c.persist([game]) def main(): - logger.info("Saving games to {}".format(DB_REPO_URI)) + print('inside main') + # logger.info("Saving games to {}".format(DB_REPO_URI)) for game in fetch_games(): - save_game(game) + print("main!", game) if __name__ == "__main__": From 164d4bec0e9e28839f128e4814dcd209ff37ba09 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 10:43:43 -0700 Subject: [PATCH 3/7] Initial commit --- new_mexico.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 new_mexico.py diff --git a/new_mexico.py b/new_mexico.py new file mode 100644 index 0000000..1313be5 --- /dev/null +++ b/new_mexico.py @@ -0,0 +1,113 @@ +import logging +import os +import re +from xmlrpc import client +import traceback + +from bs4 import BeautifulSoup as bs +import requests + + +from lotto_site_parsers.util import save_image +from lotto_site_parsers.util import save_game + +logger = logging.getLogger(__name__) + +DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989") +BASE_URL = "https://www.nmlottery.com" +INDEX_URL = "https://www.nmlottery.com/games/scratchers" +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0", +} + + +def get_games(site_url): + """ + Takes the URL from the scratcher site + parses page for game ids and game info + returns and list of tuples with the id and game info for each game + """ + html = requests.get(site_url, headers=HEADERS).text + soup = bs(html, "html.parser") + + games_html = soup.find_all("div", class_="filter-block") + + ids = [ + re.search("\d+", id.text).group(0) + for id in soup.find_all("p", class_="game-number") + ] + + game_names = [name.text for name in soup.find_all("h3")] + + return list(zip(ids, game_names, games_html)) + + +def process_game(game_info): + """ + function takes game info: [game id, game_name, game_html_data] + + parses info to find specific game data + ex name, game_id, price, odds, prizes, how to play, image_url + + returns game object + """ + + game_html = game_info[2] + + name = game_info[1] + + game_id = game_info[0] + + price = float(game_html.find("p", class_="price").text.replace("$", "")) + + how_to_play = game_html.find("p", class_="how-to-play").find_next("span").text + + prizes = [ + { + "prize": row[0].strip(), + "value": price + if "prize ticket" in row[0].lower() + else float(row[0].replace("$", "").replace(",", "")), + "claimed": int(row[2].replace(",", "")) - int(row[3].replace(",", "")), + "available": int(row[3].replace(",", "")), + "total": int(row[2].replace(",", "")), + "odds": float(row[1].replace(",", "")), + } + for row in [ + row.text.split("\n")[1:-1] for row in game_html.table.find_all("tr")[1:] + ] + ] + + num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"]) + + image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"] + image_location = save_image("nm", game_id, image_url, headers=HEADERS) + + game = { + "name": name, + "game_id": game_id, + "price": price, + "how_to_play": how_to_play, + "prizes": prizes, + "num_tx_initial": num_of_tix, + "state": "nm", + "image_urls": '["{}"]'.format(image_location), + } + + return game + + +def main(): + games = get_games(INDEX_URL) + for game in games: + try: + game = process_game(game) + save_game(game) + except Exception as e: + logger.warning(f"Unable to process game: {game[0]}-{game[1]}") + logger.warning(e) + traceback.print_exception(e) + + +if __name__ == "__main__": + main() From 1d4562f014dbee9fb24b2197233af2cae9bd3a55 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 11:20:40 -0700 Subject: [PATCH 4/7] Debugging image_urls validation error with save_image util --- .../new_mexico.py | 24 ++++++----- lottery_data_scraper/util.py | 42 +++++++++++++++++++ 2 files changed, 55 insertions(+), 11 deletions(-) rename new_mexico.py => lottery_data_scraper/new_mexico.py (82%) diff --git a/new_mexico.py b/lottery_data_scraper/new_mexico.py similarity index 82% rename from new_mexico.py rename to lottery_data_scraper/new_mexico.py index 1313be5..924db38 100644 --- a/new_mexico.py +++ b/lottery_data_scraper/new_mexico.py @@ -5,15 +5,13 @@ from xmlrpc import client import traceback from bs4 import BeautifulSoup as bs -import requests +from lottery_data_scraper.schemas import GameSchema +from lottery_data_scraper.util import fetch_html +from lottery_data_scraper.util import save_image -from lotto_site_parsers.util import save_image -from lotto_site_parsers.util import save_game - logger = logging.getLogger(__name__) -DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989") BASE_URL = "https://www.nmlottery.com" INDEX_URL = "https://www.nmlottery.com/games/scratchers" HEADERS = { @@ -27,7 +25,7 @@ def get_games(site_url): parses page for game ids and game info returns and list of tuples with the id and game info for each game """ - html = requests.get(site_url, headers=HEADERS).text + html = fetch_html(site_url) soup = bs(html, "html.parser") games_html = soup.find_all("div", class_="filter-block") @@ -81,7 +79,8 @@ def process_game(game_info): num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"]) image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"] - image_location = save_image("nm", game_id, image_url, headers=HEADERS) + # FIXME: "image_urls" currently NoneType and not passing GameSchema + # image_location = save_image("nm", game_id, image_url, headers=HEADERS) game = { "name": name, @@ -91,23 +90,26 @@ def process_game(game_info): "prizes": prizes, "num_tx_initial": num_of_tix, "state": "nm", - "image_urls": '["{}"]'.format(image_location), + # "image_urls": '["{}"]'.format(image_url), + "image_urls": f'["{{image_url}}"]', } return game def main(): + final_games = [] games = get_games(INDEX_URL) for game in games: try: game = process_game(game) - save_game(game) + final_games.append(game) except Exception as e: logger.warning(f"Unable to process game: {game[0]}-{game[1]}") logger.warning(e) traceback.print_exception(e) - if __name__ == "__main__": - main() + games = main() + schema = GameSchema(many=True) + print(schema.dumps(games)) \ No newline at end of file diff --git a/lottery_data_scraper/util.py b/lottery_data_scraper/util.py index a58400e..cf7c4c1 100644 --- a/lottery_data_scraper/util.py +++ b/lottery_data_scraper/util.py @@ -1,8 +1,13 @@ +import logging import base64 import os +import re import requests from tempfile import gettempdir + +logger = logging.getLogger(__name__) + def fetch_html(url): """ Helper to fetch and cache html responses. @@ -44,3 +49,40 @@ def fetch_html(url): with open(filepath, "w+") as f: f.write(html) return html + + +def save_image(state, filename, url, headers=None): + """ + Takes an abbreviates for a state, filename(game_id), url of image location, and headers + + The function: + -parses the URL for the filetype + -establishes the image directory + -locates or create a filepath for images + -writes image info to file + """ + headers = headers or {} + extension = re.search(r"\.([^\.\?]*)($|[^\.]+$)", url).group(1) + IMAGE_DIR = os.getenv( + "IMAGE_DIR", + os.path.realpath(os.path.join(os.getenv("HOME"), ".data/assets/images")), + ) + IMAGE_DIR = f"{IMAGE_DIR}/{state}" + dirpath = IMAGE_DIR + if not os.path.exists(dirpath): + os.makedirs(dirpath) + filename = f"{filename}.{extension}" + filepath = os.path.realpath(os.path.join(dirpath, filename)) + try: + r = requests.get(url, stream=True, headers=headers) + except Exception as e: + logger.warn("Unable to download {}.\n{}".format(url, e)) + return None + if r.status_code == 200: + with open(filepath, "wb") as f: + for chunk in r: + f.write(chunk) + else: + logger.warn("Unable to download {}. {} - {}".format(url, r.status_code, r)) + return None + return "{}/{}".format(state, filename) From cf6e6d64064cb2f50f0de86631daf35acf9267bd Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 14:01:00 -0700 Subject: [PATCH 5/7] Removed save_image helper function --- lottery_data_scraper/util.py | 41 ------------------------------------ 1 file changed, 41 deletions(-) diff --git a/lottery_data_scraper/util.py b/lottery_data_scraper/util.py index cf7c4c1..1b05ab2 100644 --- a/lottery_data_scraper/util.py +++ b/lottery_data_scraper/util.py @@ -1,13 +1,9 @@ -import logging import base64 import os -import re import requests from tempfile import gettempdir -logger = logging.getLogger(__name__) - def fetch_html(url): """ Helper to fetch and cache html responses. @@ -49,40 +45,3 @@ def fetch_html(url): with open(filepath, "w+") as f: f.write(html) return html - - -def save_image(state, filename, url, headers=None): - """ - Takes an abbreviates for a state, filename(game_id), url of image location, and headers - - The function: - -parses the URL for the filetype - -establishes the image directory - -locates or create a filepath for images - -writes image info to file - """ - headers = headers or {} - extension = re.search(r"\.([^\.\?]*)($|[^\.]+$)", url).group(1) - IMAGE_DIR = os.getenv( - "IMAGE_DIR", - os.path.realpath(os.path.join(os.getenv("HOME"), ".data/assets/images")), - ) - IMAGE_DIR = f"{IMAGE_DIR}/{state}" - dirpath = IMAGE_DIR - if not os.path.exists(dirpath): - os.makedirs(dirpath) - filename = f"{filename}.{extension}" - filepath = os.path.realpath(os.path.join(dirpath, filename)) - try: - r = requests.get(url, stream=True, headers=headers) - except Exception as e: - logger.warn("Unable to download {}.\n{}".format(url, e)) - return None - if r.status_code == 200: - with open(filepath, "wb") as f: - for chunk in r: - f.write(chunk) - else: - logger.warn("Unable to download {}. {} - {}".format(url, r.status_code, r)) - return None - return "{}/{}".format(state, filename) From 98219f3ad50a5b60f2113a0a4fb038180f0234b0 Mon Sep 17 00:00:00 2001 From: anela Date: Fri, 21 Apr 2023 14:02:26 -0700 Subject: [PATCH 6/7] Scrapes data for New Mexico games Debugged type error. --- lottery_data_scraper/new_mexico.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lottery_data_scraper/new_mexico.py b/lottery_data_scraper/new_mexico.py index 924db38..b7ce12e 100644 --- a/lottery_data_scraper/new_mexico.py +++ b/lottery_data_scraper/new_mexico.py @@ -7,7 +7,6 @@ import traceback from bs4 import BeautifulSoup as bs from lottery_data_scraper.schemas import GameSchema from lottery_data_scraper.util import fetch_html -from lottery_data_scraper.util import save_image logger = logging.getLogger(__name__) @@ -79,8 +78,6 @@ def process_game(game_info): num_of_tix = int(prizes[0]["odds"] * prizes[0]["total"]) image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"] - # FIXME: "image_urls" currently NoneType and not passing GameSchema - # image_location = save_image("nm", game_id, image_url, headers=HEADERS) game = { "name": name, @@ -90,8 +87,7 @@ def process_game(game_info): "prizes": prizes, "num_tx_initial": num_of_tix, "state": "nm", - # "image_urls": '["{}"]'.format(image_url), - "image_urls": f'["{{image_url}}"]', + "image_urls": f'["{image_url}"]', } return game @@ -108,8 +104,10 @@ def main(): logger.warning(f"Unable to process game: {game[0]}-{game[1]}") logger.warning(e) traceback.print_exception(e) + return final_games + if __name__ == "__main__": games = main() schema = GameSchema(many=True) - print(schema.dumps(games)) \ No newline at end of file + print(schema.dumps(games)) From a4f7cd940a1a136161741a2b91eb250eeda26e4d Mon Sep 17 00:00:00 2001 From: tdhood Date: Fri, 21 Apr 2023 14:11:09 -0700 Subject: [PATCH 7/7] removed maryland --- lottery_data_scraper/maryland.py | 119 ------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 lottery_data_scraper/maryland.py diff --git a/lottery_data_scraper/maryland.py b/lottery_data_scraper/maryland.py deleted file mode 100644 index 5d49778..0000000 --- a/lottery_data_scraper/maryland.py +++ /dev/null @@ -1,119 +0,0 @@ -import logging -import os -import re -from xmlrpc import client - -import html2text -import requests -from selenium import webdriver -from bs4 import BeautifulSoup as bs - -logger = logging.getLogger(__name__) - -s = requests.Session() -h = html2text.HTML2Text() - -DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989") -BASE_URL = "https://www.mdlottery.com" -BASE_INDEX_URL = "https://www.mdlottery.com/games/scratch-offs/" -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0", - "Host": "www.mdlottery.com", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "en-US,en;q=0.5", -} -INDEX_URL = "https://www.mdlottery.com/wp-admin/admin-ajax.php?action=jquery_shortcode&shortcode=scratch_offs" - - -def _name(game_div): - return game_div.find(class_="name").text - - -def _num(game_li): - return game_li.find(text="Game: ").next.text - - -def _price(game_li): - return int(game_li.find(class_="price").text.replace("$", "")) - - -def _odds(game_li): - odds = game_li.find(class_="probability").text - return float(odds) - - -def _num_tx(game_li): - return int(sum(p["available"] + p["claimed"] for p in _prizes(game_li)) * _odds(game_li)) - - -def _prizes(game_li): - table = game_li.find("table") - rows = table.find_all("tr")[1:] - prizes = [] - for row in rows: - cells = row.find_all("td") - prize = cells[0].text - value = float(re.sub(r"[\$,]", "", prize)) - available = int(cells[2].text) - claimed = int(cells[1].text) - available - prizes.append( - {"prize": prize, "value": value, "available": available, "claimed": claimed} - ) - return prizes - - -def _how_to_play(game_li): - return h.handle(str(game_li.find(class_="how-to-play"))) - - -def games(requests): - # Headless needed to run on server with no display - options = webdriver.firefox.options.Options() - options.headless = True - driver = webdriver.Firefox(options=options) - driver.get(INDEX_URL) - html = driver.page_source - soup = bs(html, "lxml") - game_lis = soup.find_all("li", class_="ticket") - games = [ - { - "name": _name(game_li), - "game_id": _num(game_li), - "url": BASE_INDEX_URL, - "how_to_play": _how_to_play(game_li), - "price": _price(game_li), - "state": "md", - "num_tx_initial": _num_tx(game_li), - "prizes": _prizes(game_li), - } - for game_li in game_lis - ] - return games - - -def fetch_games(): - result_games = [] - for game in games(s)[:2]: - print("fetch_games!", game) - result_games.append(game) - return result_games - - -# def save_game(game): -# with client.ServerProxy(DB_REPO_URI) as c: -# logger.debug("Saving game: {} - {}".format(game["game_id"], game["name"])) -# c.persist([game]) - - -def main(): - print('inside main') - # logger.info("Saving games to {}".format(DB_REPO_URI)) - for game in fetch_games(): - print("main!", game) - - -if __name__ == "__main__": - main()