Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import re
- from urllib.request import urlopen
- from urllib.error import URLError
- from bs4 import BeautifulSoup
- import sys # For exiting safely
- import sqlalchemy
- import mysql.connector
- from sqlalchemy.types import TEXT
- import math
- from random import random
- import time
- import subprocess
- import datetime
- import numpy as np
- import requests
- from itertools import permutations
- from itertools import product
- import re
- # from sqlalchemy import create_engine
- pd.set_option('display.max_rows', None)
- pd.set_option('display.max_columns', None)
- pd.set_option('display.width', None)
- pd.set_option('display.max_colwidth', None)
- #for use in terminal
- # cnx = sqlalchemy.create_engine("mysql://doadmin:oky0rg65nrwh1128@"
- # "baseball-db-cluster-do-user-6778142-0.db.ondigitalocean.com:"
- # "25060/pg?ssl=true&charset=utf8")
- #for use in PyCharm
- cnx = mysql.connector.connect(user='doadmin', password='oky0rg65nrwh1128',
- host='baseball-db-cluster-do-user-6778142-0.db.ondigitalocean.com',
- database='pg', port="25060")
- get_players = 'SELECT * FROM pg.pg_to_maxprep_matches LIMIT 10'
- starting_players_pd = pd.read_sql_query(
- get_players,
- con=cnx)
- starting_players_pd['url_to_use'] = starting_players_pd['url'].str[:-11] + 'baseball/stats.htm'
- #, len(starting_players_pd['url_to_use'])
- for i in range (0,9):
- r = requests.get(starting_players_pd['url_to_use'][i])
- print(starting_players_pd['url_to_use'][i])
- soup = BeautifulSoup(r.text, 'html.parser')
- #to get graduation year
- try:
- grad_year_top_left = soup.select_one('.graduation-year').text
- except:
- grad_year_top_left = "No graduation year listed"
- #print(grad_year_top_left)
- #to get class year listed
- try:
- class_grade = soup.select_one('.grade').text
- except:
- class_grade = "No class grade listed"
- #print(class_grade)
- #trying to get state and national rank and failing here
- try:
- state_rank = soup.findAll('span')
- # for span in state_rank:
- # print(span.string)
- except:
- state_rank = 'No state rank'
- #print(state_rank)
- #all of the stats in the top of the field are sorted by li
- #but I can't figure out how to pull them each out individually
- try:
- featured_stats = soup.select_one('.featured-stats')
- featured_stats = featured_stats.find_all('li')
- except:
- featured_stats = 'No featured stats'
- #print(featured_stats)
- try:
- all_stats = soup.find('div', class_="stats-grids")
- #print(all_stats)
- all_categories = all_stats.findAll('h3') #mx-grid sortable stats-grid
- player_pd = pd.DataFrame()
- for catgeory in all_categories:
- suffix_to_be_added = catgeory.contents[0]
- table_data = catgeory.find_parent('div')
- pd_table = pd.read_html(str(table_data))
- pd_table = (pd.DataFrame(pd_table))
- #print(pd_table)
- combo_stat = pd.DataFrame()
- for i in range (0, (pd_table.shape[0])):
- sep_table = (pd.DataFrame((pd_table.iloc[i])).unstack()).iloc[0]
- sep_table.drop(sep_table.tail(1).index, inplace=True)
- sep_table.set_index('Year')
- sep_table.columns = [str(i) + '_' + suffix_to_be_added for i in (sep_table.columns.values)] #I need to figure out a way to make this only affect things that come after the first four
- combo_stat = combo_stat.append(sep_table, ignore_index=True)
- player_pd = player_pd.append(combo_stat)
- print("Here are the stats for player_name ", starting_players_pd['name'][i])
- print(player_pd)
- print('\n')
- except Exception as e:
- first_year_stats = pd.DataFrame()
- print(first_year_stats)
- #this is not working for some reason
- # with cnx.begin() as conn:
- # updsql = sqlalchemy.sql.text(
- # "REPLACE INTO maxprep_data_zach_1 SET url_to_use = :url_to_use, player_name = :name"
- # )
- # conn.execute(updsql, url_to_use=str(starting_players_pd['url_to_use'][i]),
- # name=str(starting_players_pd['name'][i])
- # )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement