added louisiana.py
parent 8e11aae073
commit 9215af12a3
@@ -0,0 +1,85 @@
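"""Scraper for Louisiana lottery scratch-off games.

Fetches the index of scratch-offs with top prizes remaining from
louisianalottery.com, parses each game page for its price, prize tiers,
and odds, and prints the games serialized with GameSchema.
"""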
import sys
import traceback
from copy import deepcopy
import locale
import logging
import os
import re

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html

logger = logging.getLogger(__name__)

# Values that are used in many places throughout a script are worth
# assigning to constants.
BASE_URL = "http://www.louisianalottery.com"
INDEX_URL = "https://louisianalottery.com/scratch-offs/top-prizes-remaining"


def parse_index(html):
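    """Return the list of game page URLs found on the scratch-off index page.

    The index links are protocol-relative, so the scheme is prepended.
    """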
    soup = bs(html, "lxml")
    table = soup.find("table")
    game_hrefs = table.select("tr > td > a")
    game_urls = list(map(lambda x: "https:" + x.attrs["href"], game_hrefs))
    return game_urls


def parse_game(url, html):
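    """Parse a single game page into a game dict.

    The ticket price and game title come from the prize-info block, the
    game id is taken from the URL, and the initial number of tickets is
    estimated from the grand-prize odds and the number of grand prizes.
    Prize tiers are read from the third table on the page.
    """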
    soup = bs(html, "lxml")
    price = soup.select('div[id="scratch-off-prize-info"] td')[1].text.replace("$", "")
    name = soup.find(class_="scratch-off-title").text
    num = url.split("/")[-2]
    grand_prize_row = soup.select(
        'div[id="scratch-off-table-tier"] table > tbody > tr'
    )[0]
    grand_prize_odds = float(
        grand_prize_row.select("td")[1].text.split(" in ")[1].replace(",", "")
    )
    grand_prize_num = int(grand_prize_row.select("td")[2].text)
    # Estimate the initial number of tickets from the grand-prize odds and
    # the number of grand prizes.
    num_tx = int(grand_prize_odds * grand_prize_num)
    # The prize-tier breakdown is the third table on the page.
    table = soup.find_all("table")[2]
    df = pd.read_html(str(table))[0]
    # Strip the literal "$" from the prize column; regex=False so pandas
    # doesn't treat "$" as an end-of-string anchor.
    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)
    # "TICKET" prizes are valued at the ticket price.
    df = df.replace("TICKET", price)
    prizes = [
        {
            "prize": prize,
            "value": float(prize.replace(",", "")),
            "claimed": int(claimed),
            "available": int(total) - int(claimed),
        }
        for prize, _, total, claimed in [list(r[1])[:4] for r in df.iterrows()]
    ]
    game = {
        "name": name,
        "game_id": num,
        "url": url,
        "state": "la",
        "price": float(price),
        "num_tx_initial": num_tx,
        "prizes": prizes,
    }
    return game


def main():
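    """Fetch and parse every game listed on the index page.

    Games that fail to parse are logged and skipped.
    """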
    index_html = requests.get(INDEX_URL).text
    game_urls = parse_index(index_html)
    url_htmls = zip(game_urls, [requests.get(url).text for url in game_urls])
    games = []
    for url, html in url_htmls:
        try:
            game = parse_game(url, html)
        except Exception as e:
            logger.warning("Unable to parse {}.\n{}".format(url, e))
            continue
        games.append(game)
    return games


if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
    print(schema.dumps(games))