From abf7c1f103742ff0534631179fce2c4b93a38e4e Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:19:24 -0700
Subject: [PATCH 1/6] texas added

---
 lottery_data_scraper/texas.py | 140 ++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 lottery_data_scraper/texas.py

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
new file mode 100644
index 0000000..41fe334
--- /dev/null
+++ b/lottery_data_scraper/texas.py
@@ -0,0 +1,140 @@
+"""
+Scrapes the Louisiana lottery website for scratch-off ticket
+data and calculates the expected value for each game.
+
+Louisiana publishes the number of tickets printed and how many
+tickets are printed at each prize level.
+
+We can calculate the expected value of a game by summing
+the value of all the prizes and dividing that by the cost
+of all the tickets.
+
+The Texas lottery website has a "top prizes remaining" or "index" page that
+has links to every game that could still be profitable.
+Each individual game page has a "game rules" section and a prize table.
+We can use each individual game page to gather the important data, and
+then run our calculations.
+
+Website that we'll be scraping:
+http://www.txlottery.org/export/sites/lottery/Games/Scratch_Offs/all.html
+
+Example usage:
+    python -m texas
+Or:
+    LOGLEVEL=DEBUG USE_CACHE=True python -m texas
+
+The following behavior is configurable through shell environment variables.
+
+Set LOGLEVEL to print useful debug info to console.
+LOGLEVEL=[DEBUG,INFO,WARNING,ERROR,CRITICAL]
+Defaults to WARNING.
+
+Set USE_CACHE to cache responses. This speeds up development
+and is nice to the servers we're hitting.
+USE_CACHE=[True]
+Defaults to False. Note: setting this env variable to the string "False"
+will still enable the cache, because the string "False" is truthy.
+Either set it to True or don't set it.
+""" + +import logging +import os +import re +from xmlrpc import client + +from bs4 import BeautifulSoup as bs +import pandas as pd +import requests +from lottery_data_scraper.schemas import GameSchema +from lottery_data_scraper.util import fetch_html + +logger = logging.getLogger(__name__) + +BASE_URL = "http://www.txlottery.org" +INDEX_URL = ( + "http://www.txlottery.org/export/sites/lottery/Games/Scratch_Offs/all.html" +) + + +def parse_index(html): + soup = bs(html, "lxml") + table = soup.find("table") + game_hrefs = table.select("tr > td > a") + game_urls = list(map(lambda x: BASE_URL + x.attrs["href"], game_hrefs)) + return game_urls + + +def parse_game(url, html): + soup = bs(html, "lxml") + price = int( + re.match( + r"\$(\d+)", + soup.select("h3 > img")[0].attrs["alt"] + ).group(1) + ) + game_details = soup.select(".large-4.cell > h3")[0].parent.text.strip() + title = soup.select(".large-12.cell > .text-center > h2")[0].text.split(" - ") + name = title[1] + num = title[0][-4:] + num_tx = int( + re.match( + r".*?([\d,]+)", + soup.find(string=re.compile(r"There are approximately [\d,]+.*")).strip() + ).group(1).replace(",", "") + ) + # Prizes + table = soup.find("table") + df = pd.read_html(str(table))[0] + df = df.replace("---", 0) + df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False) # noqa: E231 + prizes = [] + for prize, total, claimed in [list(r[1]) for r in df.iterrows()]: + match = re.match(r"\$?([\d,]+).*wk.*", prize) + if match: + value = float(match.group(1).replace(",", "")) * 20 * 52 + prize = match.group(0) + else: + value = float(prize.replace(",", "")) + prize = "$" + prize + prizes.append( + { + "prize": prize, + "value": value, + "claimed": int(claimed), + "available": int(total) - int(claimed), + } + ) + game = { + "name": name, + "game_id": num, + "url": url, + "price": price, + "state": "tx", + "num_tx_initial": num_tx, + "prizes": prizes, + } + return game + + +def _parse_game(url, html): + try: + return parse_game(url, html) + except Exception as e: + logger.warning("Unable to parse {}.\n{}".format(url, e)) + return None + + +def main(): + index_html = fetch_html(INDEX_URL) + game_urls = parse_index(index_html) + url_htmls = zip(game_urls, [fetch_html(url) for url in game_urls]) + games = [_parse_game(url, html) for url, html in url_htmls] + games = [game for game in games if game is not None] + return games + + + +if __name__ == "__main__": + games = main() + schema = GameSchema(many=True) + print(schema.dumps(games)) \ No newline at end of file From 879d08ffaf07aa1d4705178060c8da6d24d45dd9 Mon Sep 17 00:00:00 2001 From: tdhood Date: Thu, 20 Apr 2023 09:21:34 -0700 Subject: [PATCH 2/6] cleaned up --- lottery_data_scraper/texas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py index 41fe334..641f1ad 100644 --- a/lottery_data_scraper/texas.py +++ b/lottery_data_scraper/texas.py @@ -86,7 +86,7 @@ def parse_game(url, html): table = soup.find("table") df = pd.read_html(str(table))[0] df = df.replace("---", 0) - df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False) # noqa: E231 + df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "") # noqa: E231 prizes = [] for prize, total, claimed in [list(r[1]) for r in df.iterrows()]: match = re.match(r"\$?([\d,]+).*wk.*", prize) From ebe3e6e174fc66e8ef46569535fa659ff8312589 Mon Sep 17 00:00:00 2001 From: tdhood Date: Thu, 20 Apr 2023 09:22:14 -0700 Subject: [PATCH 3/6] cleaned up --- lottery_data_scraper/texas.py | 2 +- 1 file 
From 879d08ffaf07aa1d4705178060c8da6d24d45dd9 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:21:34 -0700
Subject: [PATCH 2/6] cleaned up

---
 lottery_data_scraper/texas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 41fe334..641f1ad 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -86,7 +86,7 @@ def parse_game(url, html):
     table = soup.find("table")
     df = pd.read_html(str(table))[0]
     df = df.replace("---", 0)
-    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "", regex=False)  # noqa: E231
+    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")  # noqa: E231
     prizes = []
     for prize, total, claimed in [list(r[1]) for r in df.iterrows()]:
         match = re.match(r"\$?([\d,]+).*wk.*", prize)

From ebe3e6e174fc66e8ef46569535fa659ff8312589 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 09:22:14 -0700
Subject: [PATCH 3/6] cleaned up

---
 lottery_data_scraper/texas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 641f1ad..6b91ca9 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -86,7 +86,7 @@ def parse_game(url, html):
     table = soup.find("table")
     df = pd.read_html(str(table))[0]
     df = df.replace("---", 0)
-    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")  # noqa: E231
+    df.iloc[:, 0] = df.iloc[:, 0].str.replace("$", "")
     prizes = []
     for prize, total, claimed in [list(r[1]) for r in df.iterrows()]:
         match = re.match(r"\$?([\d,]+).*wk.*", prize)

From 2ef19fe207464317993a259121fd88e19c020255 Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Thu, 20 Apr 2023 22:35:00 -0500
Subject: [PATCH 4/6] Update references of Louisiana to be Texas

---
 lottery_data_scraper/texas.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 6b91ca9..388ca97 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -1,8 +1,8 @@
 """
-Scrapes the Louisiana lottery website for scratch-off ticket
+Scrapes the Texas lottery website for scratch-off ticket
 data and calculates the expected value for each game.
 
-Louisiana publishes the number of tickets printed and how many
+Texas publishes the number of tickets printed and how many
 tickets are printed at each prize level.
 
 We can calculate the expected value of a game by summing
@@ -137,4 +137,4 @@ def main():
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
-    print(schema.dumps(games))
\ No newline at end of file
+    print(schema.dumps(games))

From 765b0039f7c82d16bdd8a62458e431945458a338 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 20:44:59 -0700
Subject: [PATCH 5/6] added new line

---
 lottery_data_scraper/texas.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index 6b91ca9..c4f4b6a 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -137,4 +137,5 @@ def main():
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
-    print(schema.dumps(games))
\ No newline at end of file
+    print(schema.dumps(games))
+    
\ No newline at end of file

From 67c1e628607d0dfbcf472da5e830f4155b1d317b Mon Sep 17 00:00:00 2001
From: tdhood
Date: Thu, 20 Apr 2023 20:47:40 -0700
Subject: [PATCH 6/6] added new line

---
 lottery_data_scraper/texas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lottery_data_scraper/texas.py b/lottery_data_scraper/texas.py
index c4f4b6a..61b2fa3 100644
--- a/lottery_data_scraper/texas.py
+++ b/lottery_data_scraper/texas.py
@@ -133,9 +133,7 @@ def main():
     return games
 
 
-
 if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
     print(schema.dumps(games))
-    
\ No newline at end of file
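
The value = ... * 20 * 52 branch in parse_game treats any prize label containing "wk" as a
weekly payment running for 20 years, regardless of the term printed in the label, and values
it at the total undiscounted payout. A quick standalone check of that branch, using a
hypothetical label (the exact wording on the Texas site may differ):

import re

# Hypothetical prize label; the exact wording on the Texas site may differ.
prize = "1,000/wk for 20 yrs"
match = re.match(r"\$?([\d,]+).*wk.*", prize)
if match:
    weekly = float(match.group(1).replace(",", ""))
    value = weekly * 20 * 52  # 20 years of weekly payments, undiscounted
    print(value)  # 1040000.0

This valuation ignores the time value of money, so annuity-style prizes are weighted by their
headline total rather than a discounted present value.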