From 37c40360c2dd8f47633cdea0331956caf9e57b15 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 14:50:54 -0700
Subject: [PATCH 1/4] adding connecticut parser and test

---
 lottery_data_scraper/connecticut.py | 127 ++++++++++++++++++++++++++++
 tests/test_connecticut.py           |  17 ++++
 2 files changed, 144 insertions(+)
 create mode 100644 lottery_data_scraper/connecticut.py
 create mode 100644 tests/test_connecticut.py

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
new file mode 100644
index 0000000..cd4ba19
--- /dev/null
+++ b/lottery_data_scraper/connecticut.py
@@ -0,0 +1,127 @@
+import logging
+import os
+import re
+import sys
+import traceback
+from xmlrpc import client
+
+from bs4 import BeautifulSoup as bs
+import html2text
+import requests
+from lottery_data_scraper.schemas import GameSchema 
+from lottery_data_scraper.util import fetch_html
+
+logger = logging.getLogger(__name__)
+
+h = html2text.HTML2Text()
+h.ignore_links = True
+
+BASE = "https://www.ctlottery.org"
+
+INDEX = "https://ctlottery.org/ScratchGamesTable"
+
+
+headers = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
+    "Referer": "https://www.ctlottery.org/ScratchGames",
+}
+
+
+
+def get_games_urls(url):
+    html = fetch_html(url)
+    soup = bs(html, "lxml")
+    table = soup.find("table")
+    game_hrefs = table.select("tr > td > a")
+    game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs))
+    return game_urls
+
+def parse_game(game_url):
+    # Each game page has two tables
+    # Table 1: Ticket Price, Num_Tx_remaining, Odds
+    # Table 2: Prize Table
+
+    game_html = fetch_html(game_url)
+    game_soup = bs(game_html, "lxml")
+
+    name = game_soup.find("h2").text
+    game_id = re.match(r"GAME #(\d*)",game_soup.find(class_="heading-sub-info").text).group(1)
+
+    #soup for table 1
+    table_one = game_soup.find(class_="img-detail-block")
+
+    price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
+
+    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(1)
+    num_tx_initial = int(num_tx_str.replace(",", ""))
+
+
+    #soup for table 2
+    table_two = game_soup.find(class_="unclaimed-prize-wrap")
+    prize_rows = (
+        table_two.find("tbody").find_all("tr")
+    )
+    prizes = []
+    for row in prize_rows:
+        prize, total, available = [r.text for r in row.find_all("td")]
+        total = int(total.replace(",", ""))
+        available = int(available.replace(",", ""))
+        # one-off handlers...
+        if re.search(r"(?i)month.*for.*life", prize):
+            value = re.search(r"[\d,]+", prize).group()
+            value = float(value.replace(",", "")) * 20 * 12
+        elif re.search(r"(?i)\$\d+ million", prize):
+            value = float(re.search(r"\d+", prize).group()) * 1000000
+        else:
+            value = re.search(r"[\d,]+", prize).group()
+            value = float(value.replace("$", "").replace(",", ""))
+        prizes.append(
+            {
+                "prize": prize,
+                "value": value,
+                "claimed": total - available,
+                "available": available,
+            }
+        )
+
+    how_to_play_soup = game_soup.find(class_="play-text-wrap")
+    #remove heading and button tags
+    how_to_play_soup.h3.extract()
+    how_to_play_soup.a.extract()
+
+    how_to_play = h.handle(how_to_play_soup.text)
+
+    image_urls = BASE + game_soup.find(id="ticket_image").attrs["src"]
+
+    game = {
+        "state": "ct",
+        "game_id": game_id,
+        "name": name,
+        "price": price,
+        # Individual games are JavaScript links
+        "url": game_url,
+        "prizes": prizes,
+        "num_tx_initial": num_tx_initial,
+        "how_to_play": how_to_play,
+        "image_urls": image_urls
+    }
+    return game
+
+def main():
+    games_urls = get_games_urls(INDEX)
+    games = []
+    for game in games_urls:
+        try:
+            game = parse_game(game)
+        except Exception as e:
+            logger.error("Unable to parse game {}.\n{}".format(game, e))
+        games.append(game)
+    return games
+
+
+if __name__ == "__main__":
+    games = main()
+    schema = GameSchema(many=True)
+    print(schema.dumps(games))
+
diff --git a/tests/test_connecticut.py b/tests/test_connecticut.py
new file mode 100644
index 0000000..f96af33
--- /dev/null
+++ b/tests/test_connecticut.py
@@ -0,0 +1,17 @@
+import unittest
+import requests
+
+from lottery_data_scraper import connecticut
+from lottery_data_scraper import schemas
+
+class TestConnecticut(unittest.TestCase):
+    def test_parse_game_html(self):
+        # URL chosen arbitrarily
+        url = 'https://www.ctlottery.org/ScratchGames/1740/'
+        game = connecticut.parse_game(url)
+        self.assertEqual(game['name'], 'Extreme Green')
+        self.assertEqual(game["price"], 10)
+        self.assertEqual(game["game_id"], "1740")
+        self.assertEqual(game["prizes"][0]["prize"], "$100,000")
+        self.assertEqual(game["prizes"][0]["value"], 100000)
+        self.assertEqual(game["num_tx_initial"], 2230800)
\ No newline at end of file

From bb1014f5416599f12ce68e0c3e7b31d59fc043c1 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 15:55:45 -0700
Subject: [PATCH 2/4] fixed issue

---
 lottery_data_scraper/connecticut.py | 32 +++++++++++++++--------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index cd4ba19..35ad1a1 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -8,7 +8,7 @@ from xmlrpc import client
 from bs4 import BeautifulSoup as bs
 import html2text
 import requests
-from lottery_data_scraper.schemas import GameSchema 
+from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 
 logger = logging.getLogger(__name__)
@@ -28,7 +28,6 @@ headers = {
 }
 
 
-
 def get_games_urls(url):
     html = fetch_html(url)
     soup = bs(html, "lxml")
@@ -37,6 +36,7 @@ def get_games_urls(url):
     game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs))
     return game_urls
 
+
 def parse_game(game_url):
     # Each game page has two tables
     # Table 1: Ticket Price, Num_Tx_remaining, Odds
@@ -46,22 +46,23 @@ def parse_game(game_url):
     game_soup = bs(game_html, "lxml")
 
     name = game_soup.find("h2").text
-    game_id = re.match(r"GAME #(\d*)",game_soup.find(class_="heading-sub-info").text).group(1)
+    game_id = re.match(
+        r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
+    ).group(1)
 
-    #soup for table 1
+    # soup for table 1
     table_one = game_soup.find(class_="img-detail-block")
 
     price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
-
-    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(1)
-    num_tx_initial = int(num_tx_str.replace(",", ""))
+    num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(
+        1
+    )
+    num_tx_initial = int(num_tx_str.replace(",", ""))
 
 
-    #soup for table 2
+    # soup for table 2
     table_two = game_soup.find(class_="unclaimed-prize-wrap")
-    prize_rows = (
-        table_two.find("tbody").find_all("tr")
-    )
+    prize_rows = table_two.find("tbody").find_all("tr")
     prizes = []
     for row in prize_rows:
         prize, total, available = [r.text for r in row.find_all("td")]
@@ -86,7 +87,7 @@ def parse_game(game_url):
         )
 
     how_to_play_soup = game_soup.find(class_="play-text-wrap")
-    #remove heading and button tags
+    # remove heading and button tags
     how_to_play_soup.h3.extract()
     how_to_play_soup.a.extract()
 
@@ -104,10 +105,11 @@ def parse_game(game_url):
         "prizes": prizes,
         "num_tx_initial": num_tx_initial,
         "how_to_play": how_to_play,
-        "image_urls": image_urls
+        "image_urls": image_urls,
     }
     return game
 
+
 def main():
     games_urls = get_games_urls(INDEX)
     games = []
@@ -116,7 +118,8 @@ def main():
             game = parse_game(game)
         except Exception as e:
             logger.error("Unable to parse game {}.\n{}".format(game, e))
-        games.append(game)
+            continue
+        games.append(game)
     return games
 
 
@@ -124,4 +127,3 @@ if __name__ == "__main__":
     games = main()
     schema = GameSchema(many=True)
     print(schema.dumps(games))
-

From 57b83d096e082c3de3ae2f3c48e0af4421f4b425 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Sat, 6 May 2023 16:18:14 -0700
Subject: [PATCH 3/4] removed unused imports

---
 lottery_data_scraper/connecticut.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index 35ad1a1..a049aaf 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -1,13 +1,8 @@
 import logging
-import os
 import re
-import sys
-import traceback
-from xmlrpc import client
 
 from bs4 import BeautifulSoup as bs
 import html2text
-import requests
 from lottery_data_scraper.schemas import GameSchema
 from lottery_data_scraper.util import fetch_html
 

From dd616da48aa0f740d28f21abab463d7507e14a44 Mon Sep 17 00:00:00 2001
From: tdhood
Date: Tue, 9 May 2023 13:16:23 -0700
Subject: [PATCH 4/4] returning image_urls as a list

---
 lottery_data_scraper/connecticut.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lottery_data_scraper/connecticut.py b/lottery_data_scraper/connecticut.py
index a049aaf..15c45d1 100644
--- a/lottery_data_scraper/connecticut.py
+++ b/lottery_data_scraper/connecticut.py
@@ -100,7 +100,7 @@ def parse_game(game_url):
         "prizes": prizes,
         "num_tx_initial": num_tx_initial,
         "how_to_play": how_to_play,
-        "image_urls": image_urls,
+        "image_urls": [image_urls],
     }
     return game
 
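A minimal usage sketch for reviewers who want to try the new parser with the patches above applied. It assumes the project's dependencies (bs4, html2text, the schemas and util modules) are installed and that www.ctlottery.org is reachable, since parse_game fetches live pages; the game-1740 URL is the same arbitrarily chosen game used by tests/test_connecticut.py.

    from lottery_data_scraper import connecticut
    from lottery_data_scraper.schemas import GameSchema

    # Parse one scratch-game page (game #1740, "Extreme Green" in the test).
    game = connecticut.parse_game("https://www.ctlottery.org/ScratchGames/1740/")
    print(game["name"], game["price"], game["num_tx_initial"])

    # Or scrape every game on the index and serialize the result,
    # mirroring what the module's __main__ block does.
    print(GameSchema(many=True).dumps(connecticut.main()))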