import pandas as pd
import re
from urllib.request import urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup
import sys  # For exiting safely
import sqlalchemy
import mysql.connector
from sqlalchemy.types import TEXT
import math
from random import random
import time
import subprocess
import datetime
import numpy as np
import requests
from itertools import permutations
from itertools import product

# from sqlalchemy import create_engine

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

#for use in terminal
# cnx = sqlalchemy.create_engine("mysql://doadmin:oky0rg65nrwh1128@"
#                                   "baseball-db-cluster-do-user-6778142-0.db.ondigitalocean.com:"
#                                   "25060/pg?ssl=true&charset=utf8")

#for use in PyCharm
cnx = mysql.connector.connect(user='doadmin', password='oky0rg65nrwh1128',
                              host='baseball-db-cluster-do-user-6778142-0.db.ondigitalocean.com',
                              database='pg', port="25060")

get_players = 'SELECT * FROM pg.pg_to_maxprep_matches LIMIT 10'

starting_players_pd = pd.read_sql_query(
    get_players,
    con=cnx)

starting_players_pd['url_to_use'] = starting_players_pd['url'].str[:-11] + 'baseball/stats.htm'

#loop over every profile row returned by the query above
for i in range(len(starting_players_pd['url_to_use'])):
    r = requests.get(starting_players_pd['url_to_use'][i])
    print(starting_players_pd['url_to_use'][i])
    soup = BeautifulSoup(r.text, 'html.parser')

    #to get graduation year
    try:
        grad_year_top_left = soup.select_one('.graduation-year').text
    except AttributeError:
        #select_one returns None when the element is missing
        grad_year_top_left = "No graduation year listed"
    #print(grad_year_top_left)

    #to get class year listed
    try:
        class_grade = soup.select_one('.grade').text
    except AttributeError:
        class_grade = "No class grade listed"
    #print(class_grade)

    #trying to get state and national rank and failing here
    try:
        state_rank = soup.findAll('span')
        # for span in state_rank:
        #     print(span.string)

    except:
        state_rank = 'No state rank'
    #print(state_rank)
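
    #untested sketch for the rank problem above: rather than keeping every <span>,
    #scan their text for strings that look like ranks; the "#12"-style pattern is an
    #assumption about the MaxPreps markup, so adjust it after inspecting the page
    # rank_texts = [s.get_text(strip=True) for s in state_rank]
    # rank_texts = [t for t in rank_texts if re.search(r'#\d+', t)]
    # print(rank_texts)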

    #all of the featured stats at the top of the page sit in <li> elements,
    #but I can't figure out how to pull them each out individually (see sketch below)
    try:
        featured_stats = soup.select_one('.featured-stats')
        featured_stats = featured_stats.find_all('li')
    except AttributeError:
        #select_one returns None when the block is missing
        featured_stats = 'No featured stats'

    #print(featured_stats)
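
    #untested sketch for the <li> problem above: when the featured-stats block exists,
    #featured_stats is a list of bs4 Tag objects, so each stat can be pulled out as its
    #own string with get_text(); splitting label from value depends on the real markup
    if isinstance(featured_stats, list):
        featured_stat_texts = [li.get_text(" ", strip=True) for li in featured_stats]
        #print(featured_stat_texts)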

    #the season stat tables sit under the "stats-grids" div, with one <h3> heading per stat category
    try:
        all_stats = soup.find('div', class_="stats-grids")
        #print(all_stats)
        all_categories = all_stats.findAll('h3') #mx-grid sortable stats-grid
        player_pd = pd.DataFrame()
        for category in all_categories:
            suffix_to_be_added = category.contents[0]
            table_data = category.find_parent('div')
            pd_table = pd.read_html(str(table_data))
            pd_table = pd.DataFrame(pd_table)
            #print(pd_table)
            combo_stat = pd.DataFrame()
            for j in range(0, pd_table.shape[0]):  #j, so the player index i from the outer loop is not overwritten
                sep_table = pd.DataFrame(pd_table.iloc[j]).unstack().iloc[0]
                sep_table.drop(sep_table.tail(1).index, inplace=True)
                # sep_table.set_index('Year')  #returns a new frame, so calling it without assignment does nothing
                sep_table.columns = [str(c) + '_' + suffix_to_be_added for c in sep_table.columns.values]
                #I need to figure out a way to make this only affect things that come after the first four (see sketch below)
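                #untested sketch for the note above: keep the first four columns (the
                #identifier columns, per the author's count) unsuffixed and rename only
                #the stat columns that follow
                # sep_table.columns = list(sep_table.columns[:4]) + \
                #     [str(c) + '_' + suffix_to_be_added for c in sep_table.columns[4:]]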
                #DataFrame.append was removed in pandas 2.x, so build the frames up with pd.concat
                combo_stat = pd.concat([combo_stat, sep_table], ignore_index=True)

            player_pd = pd.concat([player_pd, combo_stat])
        print("Here are the stats for player_name ", starting_players_pd['name'][i])
        print(player_pd)
        print('\n')

    except Exception as e:
        #no stats grids could be parsed for this player; fall back to an empty frame
        first_year_stats = pd.DataFrame()
        print(first_year_stats)

    #this is not working for some reason; most likely cnx is a mysql.connector connection
    #(no .begin()) rather than a SQLAlchemy engine, so these :name-style binds never run
    # with cnx.begin() as conn:
    #     updsql = sqlalchemy.sql.text(
    #         "REPLACE INTO maxprep_data_zach_1 SET url_to_use = :url_to_use, player_name = :name"
    #     )
    #     conn.execute(updsql, url_to_use=str(starting_players_pd['url_to_use'][i]),
    #                  name=str(starting_players_pd['name'][i])
    #                  )
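
    #untested sketch for the write above: with a plain mysql.connector connection, a
    #cursor with %s placeholders and an explicit commit should work where the SQLAlchemy
    #style binds do not (table and column names taken from the original statement)
    # cursor = cnx.cursor()
    # cursor.execute(
    #     "REPLACE INTO maxprep_data_zach_1 SET url_to_use = %s, player_name = %s",
    #     (str(starting_players_pd['url_to_use'][i]), str(starting_players_pd['name'][i]))
    # )
    # cnx.commit()
    # cursor.close()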