changes were made from pull request

added file descprition at top
3 years ago · 398fd05808
parent 9215af12a3
commit 398fd05808
1 changed files with 45 additions and 7 deletions
--- a/lottery_data_scraper/louisiana.py
+++ b/lottery_data_scraper/louisiana.py
@ -1,3 +1,42 @@
 """
 Scrapes the Louisiana lottery website for scratch-off ticket
 data and calculates the expected value for each game.
 Louisiana publishes the number of tickets printed and how many
 tickets are printed at each prize level.
 We can calculated the expected value of a game by summing
 the value of all the prizes and dividing that by the cost
 of all the tickets.
 The louisianalottery website has an "top prizes remaining" or an "index" page that 
 has links to every game that could still be profitable.
 Each individual game has a section for the "game rules" page and a prize table.
 We can use each individual game page to gather the important data, and 
 then run our calculations.
 Website that we'll be scraping:
 https://louisianalottery.com/scratch-offs/top-prizes-remaining
 Example usage:
    python -m louisiana
 Or:
    LOGLEVEL=DEBUG USE_CACHE=True python -m louisiana
 The following behavior is configurable through shell environment variables.
 Set LOGLEVEL to print useful debug info to console.
 LOGLEVEL=[DEBUG,INFO,WARNING,ERROR,CRITICAL]
 Defaults to WARNING.
 Set USE_CACHE to cache responses. This speeds up development
 and is nice to the servers we're hitting.
 USE_CACHE=[True]
 Defaults to False. Note: Setting this env variable to the string False
 will cause it to use cache because the string "False" evaluates to Truthy.
 Either set it to True or don't set it.
 """
 import sys
 import traceback
 from copy import deepcopy
@ -25,7 +64,7 @@ def parse_index(html):
    game_urls = list(map(lambda x: "https:" + x.attrs["href"], game_hrefs))
    return game_urls
-
+# TODO: convert pandas to beautiful soup
 def parse_game(url, html):
    soup = bs(html, "lxml")
    price = soup.select('div[id="scratch-off-prize-info"] td')[1].text.replace("$", "")
@ -41,12 +80,11 @@ def parse_game(url, html):
    num_tx = int(grand_prize_odds * grand_prize_num)
    table = soup.find_all("table")[2]
    df = pd.read_html(str(table))[0]
    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")  # noqa: E231
    df = df.replace("TICKET", price)
    prizes = [
        {
            "prize": prize,
-            "value": float(prize.replace(",", "")),
+            "value": float(prize.replace("$", "").replace(",", "")),
            "claimed": int(claimed),
            "available": int(total) - int(claimed),
        }
@ -65,15 +103,15 @@ def parse_game(url, html):
 def main():
-    index_html = requests.get(INDEX_URL).text
+    index_html = fetch_html(INDEX_URL)
    game_urls = parse_index(index_html)
-    url_htmls = zip(game_urls, [requests.get(url).text for url in game_urls])
+    url_htmls = zip(game_urls, [fetch_html(url) for url in game_urls])
    games = []
    for url, html in url_htmls:
        try:
            game = parse_game(url, html)
        except Exception as e:
-            logger.warn("Unable to parse {}.\n{}".format(url, e))
+            logger.error("Unable to parse {}.\n{}".format(url, e))
            continue
        games.append(game)
    return games
@ -82,4 +120,4 @@ def main():
 if __name__ == "__main__":
    games = main()
    schema = GameSchema(many=True)
-    print(schema.dumps(games))
+    print(schema.dumps(games))