# Provenance: lottery_data_scraper/maryland.py, added by commit
# 9e7a64d711b3201ddf14b3f36c8aef755352b2b5 ("Initial commit", anela, 2023-04-21).
"""Scrape Maryland Lottery scratch-off games and persist them over XML-RPC.

The index page is rendered with a headless Firefox (Selenium), parsed with
BeautifulSoup, and every ``<li class="ticket">`` element is turned into a
plain ``dict`` that is sent to the repository service at ``DB_REPO_URI``.
"""
import logging
import os
import re
from xmlrpc import client

import html2text
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

from lotto_site_parsers.util import save_image  # noqa: F401 - not used in this module

logger = logging.getLogger(__name__)

# Module-level HTTP session; it is handed to games() by fetch_games().
s = requests.Session()
# Shared HTML -> text converter used by _how_to_play().
h = html2text.HTML2Text()

# XML-RPC endpoint games are persisted to; overridable through the environment.
DB_REPO_URI = os.environ.get("DB_REPO_URI", "http://localhost:8989")
BASE_URL = "https://www.mdlottery.com"
BASE_INDEX_URL = "https://www.mdlottery.com/games/scratch-offs/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0",
    "Host": "www.mdlottery.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.5",
}
# AJAX endpoint that is actually loaded in the browser by games().
INDEX_URL = "https://www.mdlottery.com/wp-admin/admin-ajax.php?action=jquery_shortcode&shortcode=scratch_offs"


def _to_int(text):
    """Parse an integer from cell text, tolerating thousands separators."""
    return int(text.replace(",", ""))


def _name(game_div):
    """Return the text of the element with class ``name`` inside *game_div*."""
    return game_div.find(class_="name").text


def _num(game_li):
    """Return the text of the node that directly follows the ``"Game: "`` label."""
    # ``string=`` / ``.next_element`` are the current names for the deprecated
    # ``text=`` / ``.next`` aliases; the lookup itself is unchanged.
    return game_li.find(string="Game: ").next_element.text


def _price(game_li):
    """Return the ticket price as an ``int``, with the ``$`` sign removed."""
    return int(game_li.find(class_="price").text.replace("$", ""))


def _odds(game_li):
    """Return the text of the ``probability`` element parsed as a ``float``."""
    # NOTE(review): presumably the N of "1 in N" overall odds - confirm on the page.
    odds = game_li.find(class_="probability").text
    return float(odds)


def _num_tx(game_li, prizes=None):
    """Estimate the number of tickets as total prize count times the odds.

    Args:
        game_li: the ticket element.
        prizes: already parsed result of ``_prizes(game_li)``; when omitted
            the prize table is parsed here.

    Returns:
        ``int(sum(available + claimed) * _odds(game_li))``.
    """
    if prizes is None:
        prizes = _prizes(game_li)
    total_prizes = sum(p["available"] + p["claimed"] for p in prizes)
    return int(total_prizes * _odds(game_li))


def _prizes(game_li):
    """Parse the first ``<table>`` of *game_li* into a list of prize dicts.

    The first row is skipped as the header.  For each remaining row the first
    cell is the prize label, the third cell is the number still available and
    ``claimed`` is the second cell minus ``available``.

    Returns:
        A list of ``{"prize", "value", "available", "claimed"}`` dicts.
    """
    # NOTE(review): the second column is presumably the starting prize count -
    # verify against the live table.
    table = game_li.find("table")
    prizes = []
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        prize = cells[0].text
        value = float(re.sub(r"[\$,]", "", prize))
        # Counts may carry thousands separators just like the prize value does.
        available = _to_int(cells[2].text)
        claimed = _to_int(cells[1].text) - available
        prizes.append(
            {"prize": prize, "value": value, "available": available, "claimed": claimed}
        )
    return prizes


def _how_to_play(game_li):
    """Return the ``how-to-play`` element converted from HTML by html2text."""
    return h.handle(str(game_li.find(class_="how-to-play")))


def _game(game_li):
    """Build the game record for a single ``<li class="ticket">`` element."""
    # Parse the prize table once and reuse it for the ticket-count estimate.
    prizes = _prizes(game_li)
    return {
        "name": _name(game_li),
        "game_id": _num(game_li),
        "url": BASE_INDEX_URL,
        "how_to_play": _how_to_play(game_li),
        "price": _price(game_li),
        "state": "md",
        "num_tx_initial": _num_tx(game_li, prizes),
        "prizes": prizes,
    }


def games(requests):
    """Load the scratch-off index in headless Firefox and parse every ticket.

    Args:
        requests: unused; kept so existing callers that pass a session keep
            working.  Inside this function the name shadows the ``requests``
            module.

    Returns:
        A list with one game dict per ``<li class="ticket">`` on the page.
    """
    # Headless is needed to run on a server with no display.  The command-line
    # flag is used instead of the deprecated ``options.headless`` setter.
    options = webdriver.FirefoxOptions()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(INDEX_URL)
        html = driver.page_source
    finally:
        # Always shut the browser down so no Firefox process is left behind.
        driver.quit()
    soup = bs(html, "lxml")
    return [_game(game_li) for game_li in soup.find_all("li", class_="ticket")]


def fetch_games():
    """Return all scraped games as a list."""
    return list(games(s))


def save_game(game):
    """Persist a single game dict through the XML-RPC ``persist`` method."""
    with client.ServerProxy(DB_REPO_URI) as c:
        logger.debug("Saving game: %s - %s", game["game_id"], game["name"])
        c.persist([game])


def main():
    """Scrape every game and save each one to ``DB_REPO_URI``."""
    logger.info("Saving games to %s", DB_REPO_URI)
    for game in fetch_games():
        save_game(game)


if __name__ == "__main__":
    main()