commit
29d9607bb6
@ -0,0 +1,113 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from xmlrpc import client
|
||||
import traceback
|
||||
|
||||
from bs4 import BeautifulSoup as bs
|
||||
from lottery_data_scraper.schemas import GameSchema
|
||||
from lottery_data_scraper.util import fetch_html
|
||||
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Root of the nmlottery.com site.
BASE_URL = "https://www.nmlottery.com"
# Index page listing the scratcher games. Derived from BASE_URL so the host
# is spelled out in exactly one place.
INDEX_URL = f"{BASE_URL}/games/scratchers"
# Browser-like request headers.
# NOTE(review): not referenced by the code visible in this module - confirm
# whether fetch_html is expected to receive them.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
}
|
||||
|
||||
|
||||
def get_games(site_url):
    """
    Fetch the scratcher index page and collect the raw data for each game.

    The page is parsed for three parallel sequences: the digits of every
    ``p.game-number`` element, the text of every ``h3`` element, and every
    ``div.filter-block`` element. They are paired up positionally.

    Args:
        site_url: URL of the scratcher index page, passed to ``fetch_html``.

    Returns:
        A list of ``(game_id, game_name, game_html)`` tuples, where
        ``game_id`` is the digit string from the game-number element and
        ``game_html`` is the parsed ``div.filter-block`` element.

    Raises:
        AttributeError: if a game-number element contains no digits
            (``re.search`` returns ``None``).
    """
    html = fetch_html(site_url)
    soup = bs(html, "html.parser")

    games_html = soup.find_all("div", class_="filter-block")

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    ids = [
        re.search(r"\d+", number_tag.text).group(0)
        for number_tag in soup.find_all("p", class_="game-number")
    ]

    game_names = [heading.text for heading in soup.find_all("h3")]

    # zip() stops at the shortest sequence, so a layout change that adds or
    # drops one kind of element would silently misalign or lose games.
    if not len(ids) == len(game_names) == len(games_html):
        logger.warning(
            "Mismatched scratcher counts on %s: %d ids, %d names, %d game blocks",
            site_url,
            len(ids),
            len(game_names),
            len(games_html),
        )

    return list(zip(ids, game_names, games_html))
|
||||
|
||||
|
||||
def _parse_prize_row(cells, ticket_price):
    """Turn the text cells of one prize-table row into a prize dict.

    ``cells`` is indexed as ``[prize label, odds, total, remaining]``.
    A label containing "prize ticket" is valued at ``ticket_price``;
    any other label is parsed as a dollar amount.
    """
    label = cells[0].strip()
    if "prize ticket" in label.lower():
        value = ticket_price
    else:
        value = float(label.replace("$", "").replace(",", ""))

    total = int(cells[2].replace(",", ""))
    available = int(cells[3].replace(",", ""))

    return {
        "prize": label,
        "value": value,
        "claimed": total - available,
        "available": available,
        "total": total,
        "odds": float(cells[1].replace(",", "")),
    }


def process_game(game_info):
    """
    Build a game dict from one entry produced by ``get_games``.

    Args:
        game_info: sequence of ``[game_id, game_name, game_html]`` where
            ``game_html`` is the parsed element holding the game's price,
            how-to-play text, prize table and image.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``price``,
        ``how_to_play``, ``prizes``, ``num_tx_initial``, ``state`` and
        ``image_urls``. ``image_urls`` is a JSON-encoded list holding the
        single image URL.

    Raises:
        ValueError: if the prize table has no data rows, or a numeric cell
            cannot be parsed.
        AttributeError: if an expected element is missing from
            ``game_html`` (the lookup returns ``None``).
    """
    # Local import keeps this block self-contained; the module's import
    # header does not bring in json.
    import json

    game_id = game_info[0]
    name = game_info[1]
    game_html = game_info[2]

    price = float(game_html.find("p", class_="price").text.replace("$", ""))

    how_to_play = game_html.find("p", class_="how-to-play").find_next("span").text

    # First <tr> is skipped as the header row. Each row's text is split on
    # newlines; the first and last pieces are dropped, leaving the cell texts.
    prizes = [
        _parse_prize_row(table_row.text.split("\n")[1:-1], price)
        for table_row in game_html.table.find_all("tr")[1:]
    ]
    if not prizes:
        raise ValueError(f"No prize rows found for game {game_id}-{name}")

    # Ticket count is estimated as odds x total from the first prize row.
    # round() rather than int(): the float product can land just below the
    # true integer (4.35 * 100 -> 434.99999999999994) and int() would
    # truncate it to one ticket short.
    first_prize = prizes[0]
    num_of_tix = round(first_prize["odds"] * first_prize["total"])

    image_url = game_html.find("div", class_="scratcher-image").find_next("img")["src"]

    return {
        "name": name,
        "game_id": game_id,
        "price": price,
        "how_to_play": how_to_play,
        "prizes": prizes,
        "num_tx_initial": num_of_tix,
        "state": "nm",
        # json.dumps escapes quotes/backslashes that hand-built JSON would not.
        "image_urls": json.dumps([image_url]),
    }
|
||||
|
||||
|
||||
def main():
    """
    Scrape every scratcher game listed on the index page.

    Each entry returned by ``get_games(INDEX_URL)`` is passed to
    ``process_game``. A game that fails to process is logged and skipped,
    so one bad game does not abort the whole run.

    Returns:
        A list of the game dicts that were processed successfully.
    """
    final_games = []
    for game_info in get_games(INDEX_URL):
        try:
            final_games.append(process_game(game_info))
        except Exception:
            # Deliberate best-effort boundary: record the failure with its
            # traceback and carry on. exc_info=True is used instead of
            # traceback.print_exception(e), whose single-argument form only
            # exists on Python 3.10+ and would itself raise on older versions.
            logger.warning(
                "Unable to process game: %s-%s",
                game_info[0],
                game_info[1],
                exc_info=True,
            )
    return final_games
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: scrape all games and print them serialized
    # through GameSchema to stdout.
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))
|
Loading…
Reference in New Issue