Fix main() appending games that failed to parse; reformat parse_game

main
Taylor Hood 2 years ago
parent 37c40360c2
commit bb1014f541

@ -8,7 +8,7 @@ from xmlrpc import client
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
import html2text import html2text
import requests import requests
from lottery_data_scraper.schemas import GameSchema from lottery_data_scraper.schemas import GameSchema
from lottery_data_scraper.util import fetch_html from lottery_data_scraper.util import fetch_html
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -28,7 +28,6 @@ headers = {
} }
def get_games_urls(url): def get_games_urls(url):
html = fetch_html(url) html = fetch_html(url)
soup = bs(html, "lxml") soup = bs(html, "lxml")
@ -37,6 +36,7 @@ def get_games_urls(url):
game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs)) game_urls = list(map(lambda x: BASE + x.attrs["href"], game_hrefs))
return game_urls return game_urls
def parse_game(game_url): def parse_game(game_url):
# Each game page has two tables # Each game page has two tables
# Table 1: Ticket Price, Num_Tx_remaining, Odds # Table 1: Ticket Price, Num_Tx_remaining, Odds
@ -46,22 +46,23 @@ def parse_game(game_url):
game_soup = bs(game_html, "lxml") game_soup = bs(game_html, "lxml")
name = game_soup.find("h2").text name = game_soup.find("h2").text
game_id = re.match(r"GAME #(\d*)",game_soup.find(class_="heading-sub-info").text).group(1) game_id = re.match(
r"GAME #(\d*)", game_soup.find(class_="heading-sub-info").text
).group(1)
#soup for table 1 # soup for table 1
table_one = game_soup.find(class_="img-detail-block") table_one = game_soup.find(class_="img-detail-block")
price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1)) price = int(re.search(r"Ticket Price:\$(\d*)", table_one.text).group(1))
num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(1)
num_tx_initial = int(num_tx_str.replace(",", ""))
num_tx_str = re.search(r"Total # of Tickets:([\d*][,\d*]+)", table_one.text).group(
1
)
num_tx_initial = int(num_tx_str.replace(",", ""))
#soup for table 2 # soup for table 2
table_two = game_soup.find(class_="unclaimed-prize-wrap") table_two = game_soup.find(class_="unclaimed-prize-wrap")
prize_rows = ( prize_rows = table_two.find("tbody").find_all("tr")
table_two.find("tbody").find_all("tr")
)
prizes = [] prizes = []
for row in prize_rows: for row in prize_rows:
prize, total, available = [r.text for r in row.find_all("td")] prize, total, available = [r.text for r in row.find_all("td")]
@ -86,7 +87,7 @@ def parse_game(game_url):
) )
how_to_play_soup = game_soup.find(class_="play-text-wrap") how_to_play_soup = game_soup.find(class_="play-text-wrap")
#remove heading and button tags # remove heading and button tags
how_to_play_soup.h3.extract() how_to_play_soup.h3.extract()
how_to_play_soup.a.extract() how_to_play_soup.a.extract()
@ -104,10 +105,11 @@ def parse_game(game_url):
"prizes": prizes, "prizes": prizes,
"num_tx_initial": num_tx_initial, "num_tx_initial": num_tx_initial,
"how_to_play": how_to_play, "how_to_play": how_to_play,
"image_urls": image_urls "image_urls": image_urls,
} }
return game return game
def main(): def main():
games_urls = get_games_urls(INDEX) games_urls = get_games_urls(INDEX)
games = [] games = []
@ -116,7 +118,8 @@ def main():
game = parse_game(game) game = parse_game(game)
except Exception as e: except Exception as e:
logger.error("Unable to parse game {}.\n{}".format(game, e)) logger.error("Unable to parse game {}.\n{}".format(game, e))
games.append(game) continue
games.append(game)
return games return games
@ -124,4 +127,3 @@ if __name__ == "__main__":
games = main() games = main()
schema = GameSchema(many=True) schema = GameSchema(many=True)
print(schema.dumps(games)) print(schema.dumps(games))

Loading…
Cancel
Save