Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from urllib.request import urlopen
- from urllib.error import URLError
- from bs4 import BeautifulSoup
- import sys # For exiting safely
- import sqlalchemy
- import mysql.connector
- from sqlalchemy.types import TEXT
- import math
- from random import random
- import time
- import subprocess
- import datetime
- import numpy as np
- import requests
- from itertools import permutations
- from itertools import product
- from iteration_utilities import deepflatten
- import re
- import itertools
- from inspect import currentframe
#a function to make debugging easier
def get_linenumber():
    """Return the line number of the calling statement, as a string."""
    caller = currentframe().f_back
    return str(caller.f_lineno)
# Widen pandas display limits so wide scrape results print without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
#this is the point in the code where we should import the PBR urls we want to use
# NOTE(review): both example URLs below are identical — the "HS only" example
# presumably should point at a different profile; confirm before relying on it.
hs_and_college_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'
only_hs_stat_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'
#this function helps you log into PBR
def pbr_login(sess):
    """Log the given requests session into prepbaseballreport.com.

    Fetches the login page, extracts the hidden CSRF ``form_key`` field,
    then POSTs the credentials to the login action URL.

    Args:
        sess: a requests.Session (cookies persist the authenticated state).

    Returns:
        The same session, now authenticated.

    Raises:
        RuntimeError: if the login form's ``form_key`` cannot be located.
        requests.HTTPError: if either HTTP request returns an error status.
    """
    loginurl = 'https://www.prepbaseballreport.com/customer/account/login/'
    loginaction = 'https://www.prepbaseballreport.com/customer/account/loginPost/'
    r1 = sess.get(loginurl)
    # FIX: previously no status check — a failed fetch produced an opaque
    # TypeError below when select_one() returned None.
    r1.raise_for_status()
    logsoup = BeautifulSoup(r1.text, 'html.parser')
    form_key_input = logsoup.select_one('#login-form input[name="form_key"]')
    if form_key_input is None:
        raise RuntimeError('Could not find form_key on PBR login page '
                           '(page layout may have changed)')
    form_key = form_key_input['value']
    # SECURITY NOTE(review): credentials are hard-coded; consider moving them
    # to environment variables or a config file kept out of version control.
    login_data = {'login[username]':'mohammed246@cmailing.com', 'login[password]': 'dinger2034', 'form_key': form_key}
    r = sess.post(loginaction, data=login_data, headers=dict(Referer=loginurl))
    r.raise_for_status()
    print('Login Success')
    return sess
#this is a function to determine if the player played in highschool, college, or both
#it doesn't seem like there are players who only play in college on PBR but as we try more cases maybe some will come up?
def determine_levels_played(url, sess):
    """Fetch a PBR profile page and dispatch to the appropriate scraper.

    Returns None for a 404; otherwise hands the parsed page to college()
    when a '#college-profile' section exists, else to no_college().
    """
    response = sess.get(url, headers=dict(Referer=url))
    if response.status_code == 404:
        return None
    page = BeautifulSoup(response.text, 'html.parser')
    if page.select_one('#college-profile') is not None:
        college(url=url, sess=sess, soup=page)
    else:
        print('this guy has no college')
        no_college(url=url, sess=sess, soup=page)
#this is for players with both a college and highschool profile
def college(url, sess, soup):
    """Scrape all data for a player with BOTH a HS and a college profile.

    Collects rankings, bio/metric info (HS and college), scouting comments,
    historical stats panels and commitment/draft info from the already-parsed
    profile page.

    NOTE(review): everything collected is only printed / held in locals —
    nothing is returned or persisted yet; presumably DB storage comes later.
    """
    print('This is data for a player with both a hs and college profile')
    def get_rankings(url, sess, soup):
        # One '.player-rank' box per ranking category.
        rankings = soup.select('.player-rank')
        ranks = {}
        if len(rankings) > 0:
            for rank_n, rank in enumerate(rankings):
                ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
                # Strip decorative <span>s so only the rank numbers remain.
                for trsh in rank.select('span'):
                    trsh.decompose()
                ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
                ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print('Here are the rankings: ', ranks)
        return(ranks)
    def get_info(url, sess, soup):
        # Bio/metric rows from the HS side of the profile.
        info = soup.select('.info-section li > div,'
                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        infos = {}
        for dat_n, dat in enumerate(info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                # 'player-stat' rows carry a "(date)" span next to the value.
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                # bare except: rows without a "class" attr raise KeyError here.
                # NOTE(review): 'date' can be left over from a prior iteration.
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    infos[label + "_date"] = date
            except:
                pass
        # Re-fetch the page anchored at the college tab and repeat the parse.
        college_r = sess.get(str(url)+'#college-profile', headers=dict(Referer=url))
        college_soup = BeautifulSoup(college_r.text, 'html.parser')
        college_info = college_soup.select('.info-section li > div,'
                                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        college_infos = {}
        for dat_n, dat in enumerate(college_info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            college_infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    college_infos[label + "_date"] = date
            except:
                pass
        print('hs infos', infos)
        print('college infos', college_infos)
        return(infos, college_infos)
    def get_comments(url, sess, soup):
        # HS scouting comments.
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        if len(comments) > 0:
            for rep_n, rep in enumerate(comments):
                try:
                    # Set the fallback first, then overwrite if a link exists.
                    url = 'no url listed at index ' + str(rep_n)
                    url = rep.select_one('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    url = 'https://www.prepbaseballreport.com' + str(url[0])
                except:
                    url = 'no url listed at index ' + str(rep_n)
                try:
                    date ='no date listed at index ' + str(rep_n)
                    date = rep.select_one('.comment-date').text.strip()
                except:
                    date = 'no date listed at index ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                # label = "report_" + str(rep_n+1)
                reports.append(report)
                reportdates.append(date)
                reporttypes.append('hs')
                reporturls.append(url)
        # College comments live in tab 2; the '#stats_panel' child acts as an
        # "is there anything here?" probe before iterating.
        comments_college_test = soup.select('#college_tab2 > #stats_panel')
        comments_college = soup.select('#college_tab2')
        if len(comments_college_test)> 0:
            for rep_n, rep in enumerate(comments_college):
                try:
                    url = 'no url listed at index ' + str(rep_n)
                    url = rep.select('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    #PBR when it writes links sometimes only uses the things that following it's own prepbaseballreport.com so the follow helps grab the useful URL
                    for i in range(len(url)):
                        if (str(url[i])[:4] == 'http'):
                            url[i] = url[i]
                        elif (str(url[i][:4]) != 'http'):
                            url[i] = 'https://www.prepbaseballreport.com' + str(url[i])
                except:
                    url = "no url listed at index " + str(rep_n)
                try:
                    dates = 'no date listed at index ' + str(rep_n)
                    text_to_search = str(rep)
                    pattern = '<strong>(.*?)</strong>'
                    dates = re.findall(pattern, text_to_search)
                except:
                    dates ='no date listed at index ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                reports.append(report)
                reportdates.append(dates)
                reporttypes.append('college')
                reporturls.append(url)
        #this will make it easier to understanding the data later on, otherwise we have a list with a list added on to the end
        #the second list contains the college data, but it is hard to track them all together
        def flatten(L):
            # Recursively flatten arbitrarily nested lists into a flat stream.
            for l in L:
                if isinstance(l, list):
                    yield from flatten(l)
                else:
                    yield l
        try:
            reports = list(flatten(reports))
        except:
            pass
        try:
            reportdates = list(flatten(reportdates))
        except:
            pass
        try:
            reporttypes = list(flatten(reporttypes))
        except:
            pass
        try:
            reporturls = list(flatten(reporturls))
        except:
            pass
        print('reports', reportdates)
        return(reports, reportdates, reporttypes, reporturls)
    def get_stats_panel(url, sess, soup):
        # Historical stat tables embedded in the HS news/headlines list.
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        if len(statspanels) > 0:
            for panel_n, panel in enumerate(statspanels):
                date = panel.select_one('.date').text
                type = panel.select_one('.title').text  # NOTE: shadows builtin 'type'
                labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                # Body cells = all <td> minus the header <td>s.
                val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                       if x not in panel.select('thead td')]
                if len(labels) > 0:
                    stats['historical_' + str(panel_n + 1) + "_date"] = date
                    stats['historical_' + str(panel_n + 1) + "_type"] = type
                    # print(panel)
                    for label_n, label in enumerate(labels):
                        stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
        # Same parse repeated for the college stats tab.
        college_statspanels = soup.select('#college_tab4')
        college_stats = {}
        if len(college_statspanels) > 0:
            try:
                for panel_n, panel in enumerate(college_statspanels):
                    date = panel.select_one('.date').text
                    type = panel.select_one('.title').text
                    labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                    val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                           if x not in panel.select('thead td')]
                    if len(labels) > 0:
                        college_stats['historical_' + str(panel_n + 1) + "_date"] = date
                        college_stats['historical_' + str(panel_n + 1) + "_type"] = type
                        # print(panel)
                        for label_n, label in enumerate(labels):
                            college_stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
            except:
                # Any parse hiccup on the college tab discards ALL college stats.
                college_stats = {}
        print('hs stats', stats)
        print('college stats', college_stats)
        return(stats, college_stats)
    def get_commitments(url, sess, soup):
        # Commitment / draft boxes.
        commitments = soup.select('.commitment-draft')
        commits = {}
        if len(commitments) > 0:
            for commit in commitments:
                type = commit.select_one('h2').text.strip().lower()
                try:
                    # Linked entry (e.g. college commitment): text + href.
                    val = commit.select_one('a').text.strip().lower()
                    try:
                        valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                    except:
                        valid = ""
                    commits[type] = val
                    commits[type + "_link"] = valid
                except:
                    # Plain-text entry (e.g. draft info in a <p>).
                    val = commit.select_one('p').text.strip().lower()
                    commits[type] = val
                    if "round" in val:
                        # Looks like "year, Nth round, team" — split apart.
                        val = val.split(", ")
                        commits["draftyear"] = val[0].strip()
                        commits["draftround"] = val[1].replace("round", "").strip()
                        commits["draftteam"] = val[2].strip()
        print('commits', commits)
        return commits
    # Run all scrapers and assemble one string per comment for later storage.
    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos, collegeinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats, collegestats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)
    comments_to_make_dict = []
    for i in range (len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i])+ ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)
#this is the code that gets info for players without a college profile
def no_college(url, sess, soup):
    """Scrape all data for a player who only has a high-school profile.

    Collects rankings, bio/metric info, scouting comments, historical stats
    panels and commitment/draft info from the already-parsed profile page.

    FIX: the original invoked every get_* helper twice in a row — once
    discarding the results, then again to bind them — doubling the parsing
    work and printing everything twice. The redundant first batch of calls
    has been removed; behavior is otherwise unchanged.

    NOTE(review): like college(), the collected data is only printed / held
    in locals — nothing is returned or persisted yet.
    """
    def get_rankings(url, sess, soup):
        # One '.player-rank' box per ranking category.
        rankings = soup.select('.player-rank')
        ranks = {}
        if len(rankings) > 0:
            for rank_n, rank in enumerate(rankings):
                ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
                # Strip decorative <span>s so only the rank numbers remain.
                for trsh in rank.select('span'):
                    trsh.decompose()
                ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
                ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print(ranks)
        return(ranks)
    #in order to get the state for the state ranking
    def get_info(url, sess, soup):
        # Bio/metric rows from the profile's info section.
        info = soup.select('.info-section li > div,'
                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        infos = {}
        for dat_n, dat in enumerate(info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                # 'player-stat' rows carry a "(date)" span next to the value.
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    infos[label + "_date"] = date
            except:
                pass
        print('hs only infos', infos)
        return(infos)
    #unclear how the comments section on PBR actually works - I think that this is good enough probably
    def get_comments(url, sess, soup):
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        if len(comments) > 0:
            for rep_n, rep in enumerate(comments):
                try:
                    # Set the fallback first, then overwrite if links exist.
                    url = 'no url at index: ' + str(rep_n)
                    url = rep.select('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    # PBR when it writes links sometimes only uses the things that following it's own prepbaseballreport.com so the follow helps grab the useful URL
                    for i in range(len(url)):
                        if (str(url[i])[:4] == 'http'):
                            url[i] = url[i]
                        elif (str(url[i][:4]) != 'http'):
                            url[i] = 'https://www.prepbaseballreport.com' + str(url[i])
                except:
                    url = 'no url at index: ' + str(rep_n)
                try:
                    date = 'no date at index: ' + str(rep_n)
                    date = rep.select_one('.comment-date').text.strip()
                except:
                    date = 'no date at index: ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                # label = "report_" + str(rep_n+1)
                reports.append(report)
                reportdates.append(date)
                reporttypes.append('hs')
                reporturls.append(url)
        print('hs only reports', reports)
        return(reports, reportdates, reporttypes, reporturls)
    def get_stats_panel(url, sess, soup):
        # Historical stat tables embedded in the news/headlines list.
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        if len(statspanels) > 0:
            for panel_n, panel in enumerate(statspanels):
                date = panel.select_one('.date').text
                type = panel.select_one('.title').text  # NOTE: shadows builtin 'type'
                labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                # Body cells = all <td> minus the header <td>s.
                val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                       if x not in panel.select('thead td')]
                if len(labels) > 0:
                    stats['historical_' + str(panel_n + 1) + "_date"] = date
                    stats['historical_' + str(panel_n + 1) + "_type"] = type
                    # print(panel)
                    for label_n, label in enumerate(labels):
                        stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
        print('hs stats', stats)
        return(stats)
    def get_commitments(url, sess, soup):
        # Commitment / draft boxes.
        commitments = soup.select('.commitment-draft')
        commits = {}
        if len(commitments) > 0:
            for commit in commitments:
                type = commit.select_one('h2').text.strip().lower()
                try:
                    # Linked entry (e.g. college commitment): text + href.
                    val = commit.select_one('a').text.strip().lower()
                    try:
                        valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                    except:
                        valid = ""
                    commits[type] = val
                    commits[type + "_link"] = valid
                except:
                    # Plain-text entry (e.g. draft info in a <p>).
                    val = commit.select_one('p').text.strip().lower()
                    commits[type] = val
                    if "round" in val:
                        # Looks like "year, Nth round, team" — split apart.
                        val = val.split(", ")
                        commits["draftyear"] = val[0].strip()
                        commits["draftround"] = val[1].replace("round", "").strip()
                        commits["draftteam"] = val[2].strip()
        print('hs only commits', commits)
        return(commits)
    # Run each scraper exactly once and assemble one string per comment.
    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)
    comments_to_make_dict = []
    for i in range(len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i]) + ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)
# Entry point: authenticate one shared session, then scrape the example profile.
session = pbr_login(requests.session())
determine_levels_played(url=hs_and_college_example, sess=session)
#determine_levels_played(url=only_hs_stat_example, sess=session)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement