You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

85 lines
2.5 KiB
Python

import io
import locale
import logging
import os
import re
import sys
import traceback
from copy import deepcopy
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Values that are used in several places throughout the script are
# assigned to constants here.
# Site root address.
# NOTE(review): not referenced in the visible code, and it uses http + "www"
# while INDEX_URL uses https without "www" -- confirm which form is intended.
BASE_URL = "http://www.louisianalottery.com"
# Index page that main() downloads and hands to parse_index().
INDEX_URL = "https://louisianalottery.com/scratch-offs/top-prizes-remaining"
def parse_index(html):
    """Extract the URL of every game page linked from the index page.

    Looks at the first ``<table>`` in ``html`` and collects the ``href`` of
    each anchor that sits directly inside one of its cells.

    Args:
        html: Markup of the index page.

    Returns:
        A list of absolute game-page URLs in document order. The list is
        empty when the page contains no table.
    """
    soup = bs(html, "lxml")
    table = soup.find("table")
    if table is None:
        # No table at all means the layout changed or an error page came
        # back; report it instead of failing with an opaque AttributeError.
        logger.warning("No game table found on the index page.")
        return []
    # Resolve every href against INDEX_URL. Protocol-relative links
    # ("//host/path") come out as "https://host/path", exactly as the plain
    # "https:" prefix used to produce, while absolute and root-relative links
    # are handled correctly too. The "[href]" selector skips bare anchors.
    return [
        urljoin(INDEX_URL, anchor["href"])
        for anchor in table.select("tr > td > a[href]")
    ]
def parse_game(url, html):
    """Parse one scratch-off game page into a game dictionary.

    Args:
        url: Address the page was fetched from. Its second-to-last
            "/"-separated segment becomes the game id.
        html: Markup of the game page.

    Returns:
        A dict with the keys ``name``, ``game_id``, ``url``, ``state``,
        ``price``, ``num_tx_initial`` and ``prizes``. ``prizes`` is a list
        of dicts with the keys ``prize``, ``value``, ``claimed`` and
        ``available``.

    Raises:
        IndexError, AttributeError, ValueError: (among others) when the page
            lacks the elements selected below or a cell cannot be converted
            to a number.
    """
    soup = bs(html, "lxml")

    # Second cell of the prize-info block holds the price, e.g. "$5".
    price = (
        soup.select('div[id="scratch-off-prize-info"] td')[1]
        .text.replace("$", "")
    )
    name = soup.find(class_="scratch-off-title").text
    # NOTE(review): taking segment [-2] presumably relies on the URL ending
    # with a trailing slash -- confirm against the hrefs on the index page.
    num = url.split("/")[-2]

    # First row of the tier table is treated as the grand prize.
    grand_prize_row = soup.select(
        'div[id="scratch-off-table-tier"] table > tbody > tr'
    )[0]
    grand_prize_cells = grand_prize_row.select("td")
    # The odds cell contains text of the form "<x> in <N>"; keep N.
    grand_prize_odds = float(
        grand_prize_cells[1].text.split(" in ")[1].replace(",", "")
    )
    grand_prize_num = int(grand_prize_cells[2].text)
    # Odds denominator times the grand-prize count; reported as
    # "num_tx_initial" (presumably an estimate of tickets printed).
    num_tx = int(grand_prize_odds * grand_prize_num)

    table = soup.find_all("table")[2]
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(io.StringIO(str(table)))[0]
    # regex=False is essential: as a regular expression "$" is the
    # end-of-string anchor, so on pandas versions where regex=True is the
    # default the dollar sign would never be removed.
    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)
    # A prize listed as "TICKET" is worth the ticket price.
    df = df.replace("TICKET", price)

    # Only columns 0, 2 and 3 of the first four are used.
    first_four = df.iloc[:, :4].itertuples(index=False, name=None)
    prizes = [
        {
            "prize": prize,
            "value": float(prize.replace(",", "")),
            "claimed": int(claimed),
            "available": int(total) - int(claimed),
        }
        for prize, _, total, claimed in first_four
    ]

    return {
        "name": name,
        "game_id": num,
        "url": url,
        "state": "la",
        "price": float(price),
        "num_tx_initial": num_tx,
        "prizes": prizes,
    }
def main():
    """Download the index page and every game page, and parse each game.

    A game whose page cannot be downloaded or parsed is logged as a warning
    and skipped, so one bad page does not abort the whole run.

    Returns:
        A list of the game dicts produced by ``parse_game``, in the order
        the games appear on the index page.
    """
    index_html = requests.get(INDEX_URL).text
    games = []
    for url in parse_index(index_html):
        # Fetch inside the loop so a single failed request only costs that
        # one game instead of the entire scrape.
        try:
            html = requests.get(url).text
        except requests.RequestException as e:
            logger.warning("Unable to fetch %s.\n%s", url, e)
            continue
        try:
            game = parse_game(url, html)
        except Exception as e:
            # Deliberately broad: any layout surprise on one page should be
            # reported and skipped. logger.warning replaces the deprecated
            # logger.warn alias; the rendered message is unchanged.
            logger.warning("Unable to parse %s.\n%s", url, e)
            continue
        games.append(game)
    return games
if __name__ == "__main__":
    # Script entry point: scrape first, then serialize the collected games
    # with GameSchema and write the result to stdout.
    scraped = main()
    print(GameSchema(many=True).dumps(scraped))