import re
import time
from random import random
from inspect import currentframe

import pandas as pd
import requests
import sqlalchemy
from bs4 import BeautifulSoup


# A small helper that returns the caller's line number, to make debugging easier.
def get_linenumber():
    cf = currentframe()
    return str(cf.f_back.f_lineno)


pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# This is the point in the code where the PBR profile URLs to scrape are set.
# Note: both examples currently point to the same profile.
hs_and_college_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'
only_hs_stat_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'


# This function logs the session into PBR. Note that the credentials below are
# hardcoded; the form_key hidden field is scraped from the login form because
# the site expects it on the login POST.
def pbr_login(sess):
    loginurl = 'https://www.prepbaseballreport.com/customer/account/login/'
    loginaction = 'https://www.prepbaseballreport.com/customer/account/loginPost/'
    r1 = sess.get(loginurl)
    logsoup = BeautifulSoup(r1.text, 'html.parser')
    form_key = logsoup.select_one('#login-form input[name="form_key"]')['value']
    login_data = {'login[username]': 'mohammed246@cmailing.com', 'login[password]': 'dinger2034', 'form_key': form_key}
    r = sess.post(loginaction, data=login_data, headers=dict(Referer=loginurl))
    r.raise_for_status()  # fail loudly if the login POST errors out
    print('Login Success')
    return sess
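

# A rough post-login sanity check (a hypothetical helper, not part of the
# script above): PBR's login paths look like Magento's, and Magento bounces
# unauthenticated requests for the account page back to the login form, so
# landing on a URL without 'login' in it is a reasonable success signal.
# The '/customer/account/' path is an assumption based on that convention.
def login_succeeded(sess):
    check = sess.get('https://www.prepbaseballreport.com/customer/account/')
    return check.status_code == 200 and 'login' not in check.url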


# This determines whether the player played in high school, college, or both.
# There do not seem to be college-only players on PBR, but more cases may
# turn up as more profiles are tried.
def determine_levels_played(url, sess):
    r = sess.get(url, headers=dict(Referer=url))

    if r.status_code == 404:
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    soup_tester = soup.select_one('#college-profile')

    if soup_tester is None:
        print('this guy has no college')
        return no_college(url=url, sess=sess, soup=soup)
    else:
        return college(url=url, sess=sess, soup=soup)
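

# The time and random imports suggest scraping many profiles in one run; a
# minimal batch-crawl sketch (scrape_profiles is a hypothetical helper and
# the delay values are illustrative, not taken from the script):
def scrape_profiles(urls, sess):
    results = []
    for profile_url in urls:
        results.append(determine_levels_played(url=profile_url, sess=sess))
        time.sleep(1 + 2 * random())  # polite, randomized delay between requests
    return results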


# This handles players with both a high school and a college profile.
def college(url, sess, soup):
    print('This is data for a player with both a hs and college profile')

    def get_rankings(url, sess, soup):
        rankings = soup.select('.player-rank')
        ranks = {}
        for rank in rankings:
            ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
            # Strip the <span> elements so only the rank text remains.
            for trsh in rank.select('span'):
                trsh.decompose()
            ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
            ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print('Here are the rankings: ', ranks)
        return ranks

    def get_info(url, sess, soup):
        # The same parsing applies to the high school page and the college
        # profile, so it is factored into a local helper.
        selector = ('.info-section li > div,'
                    ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')

        def parse_info(page_soup):
            infos = {}
            for dat in page_soup.select(selector):
                val = dat.select_one('strong').text.lower().strip()
                dat.select_one('strong').decompose()
                is_stat = 'player-stat' in dat.get('class', [])
                date = ''
                if is_stat and dat.select_one('span') is not None:
                    date = dat.select_one('span').text.lower().strip().replace('(', '').replace(')', '')
                    dat.select_one('span').decompose()
                label = dat.text.lower().strip().replace(' ', '_').replace(':', '')
                infos[label] = val
                if is_stat:
                    infos[label + '_date'] = date
            return infos

        infos = parse_info(soup)

        # Note: a '#college-profile' fragment is never sent to the server, so
        # this request returns the same page; it is re-parsed as-is.
        college_r = sess.get(str(url) + '#college-profile', headers=dict(Referer=url))
        college_soup = BeautifulSoup(college_r.text, 'html.parser')
        college_infos = parse_info(college_soup)

        print('hs infos', infos)
        print('college infos', college_infos)
        return infos, college_infos

    def get_comments(url, sess, soup):
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        for rep_n, rep in enumerate(comments):
            try:
                rep_url = rep.select_one('a[href]')
                rep_url = re.findall('"([^"]*)"', str(rep_url))
                rep_url = 'https://www.prepbaseballreport.com' + str(rep_url[0])
            except Exception:
                rep_url = 'no url listed at index ' + str(rep_n)
            try:
                date = rep.select_one('.comment-date').text.strip()
            except Exception:
                date = 'no date listed at index ' + str(rep_n)
            report = rep.select_one('.comment').text.strip()
            reports.append(report)
            reportdates.append(date)
            reporttypes.append('hs')
            reporturls.append(rep_url)

        comments_college_test = soup.select('#college_tab2 > #stats_panel')
        comments_college = soup.select('#college_tab2')
        if len(comments_college_test) > 0:
            for rep_n, rep in enumerate(comments_college):
                try:
                    rep_urls = re.findall('"([^"]*)"', str(rep.select('a[href]')))
                    # PBR sometimes writes links relative to its own domain, so
                    # prepend prepbaseballreport.com to anything that is not
                    # already an absolute URL.
                    for i in range(len(rep_urls)):
                        if not str(rep_urls[i]).startswith('http'):
                            rep_urls[i] = 'https://www.prepbaseballreport.com' + str(rep_urls[i])
                except Exception:
                    rep_urls = 'no url listed at index ' + str(rep_n)
                try:
                    # The college comment dates sit inside <strong> tags.
                    dates = re.findall('<strong>(.*?)</strong>', str(rep))
                except Exception:
                    dates = 'no date listed at index ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()

                reports.append(report)
                reportdates.append(dates)
                reporttypes.append('college')
                reporturls.append(rep_urls)

            # This makes the data easier to understand later on; otherwise we
            # end up with a list that has another list (the college data)
            # tacked onto the end, which is hard to keep lined up.
            def flatten(L):
                for l in L:
                    if isinstance(l, list):
                        yield from flatten(l)
                    else:
                        yield l

            reports = list(flatten(reports))
            reportdates = list(flatten(reportdates))
            reporttypes = list(flatten(reporttypes))
            reporturls = list(flatten(reporturls))
            print('reports', reportdates)

        return reports, reportdates, reporttypes, reporturls

    def get_stats_panel(url, sess, soup):
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        for panel_n, panel in enumerate(statspanels):
            date = panel.select_one('.date').text
            panel_type = panel.select_one('.title').text
            labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
            val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                   if x not in panel.select('thead td')]
            if len(labels) > 0:
                stats['historical_' + str(panel_n + 1) + '_date'] = date
                stats['historical_' + str(panel_n + 1) + '_type'] = panel_type
            for label_n, label in enumerate(labels):
                stats['historical_' + str(panel_n + 1) + '_' + label] = val[label_n]

        college_statspanels = soup.select('#college_tab4')
        college_stats = {}
        try:
            for panel_n, panel in enumerate(college_statspanels):
                date = panel.select_one('.date').text
                panel_type = panel.select_one('.title').text
                labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                       if x not in panel.select('thead td')]
                if len(labels) > 0:
                    college_stats['historical_' + str(panel_n + 1) + '_date'] = date
                    college_stats['historical_' + str(panel_n + 1) + '_type'] = panel_type
                for label_n, label in enumerate(labels):
                    college_stats['historical_' + str(panel_n + 1) + '_' + label] = val[label_n]
        except Exception:
            # If the college tab is missing an expected element, fall back to
            # an empty dict rather than failing the whole profile.
            college_stats = {}
        print('hs stats', stats)
        print('college stats', college_stats)
        return stats, college_stats

    def get_commitments(url, sess, soup):
        commitments = soup.select('.commitment-draft')
        commits = {}
        for commit in commitments:
            commit_type = commit.select_one('h2').text.strip().lower()
            try:
                # College commitments are usually linked to a school page.
                val = commit.select_one('a').text.strip().lower()
                try:
                    valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                except Exception:
                    valid = ''
                commits[commit_type] = val
                commits[commit_type + '_link'] = valid
            except Exception:
                # Draft entries appear as plain text like 'year, Nth round, team'.
                val = commit.select_one('p').text.strip().lower()
                commits[commit_type] = val
                if 'round' in val:
                    val = val.split(', ')
                    commits['draftyear'] = val[0].strip()
                    commits['draftround'] = val[1].replace('round', '').strip()
                    commits['draftteam'] = val[2].strip()
        print('commits', commits)
        return commits

    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos, collegeinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats, collegestats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)

    # Collapse the parallel comment lists into one readable string per comment.
    comments_to_make_dict = []
    for i in range(len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i]) + ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)

    return ranks, hsinfos, collegeinfos, hsstats, collegestats, commits, comments_to_make_dict
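

# Sketch: the pieces returned by college() merge naturally into one flat
# record per player (player_record is a hypothetical helper, not part of
# the original flow):
def player_record(ranks, infos, stats, commits, comments):
    record = {**ranks, **infos, **stats, **commits}
    record['comments'] = ' | '.join(str(c) for c in comments)
    return record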


# This gets info for players without a college profile.
def no_college(url, sess, soup):

    def get_rankings(url, sess, soup):
        rankings = soup.select('.player-rank')
        ranks = {}
        for rank in rankings:
            ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
            # Strip the <span> elements so only the rank text remains.
            for trsh in rank.select('span'):
                trsh.decompose()
            ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
            ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print(ranks)
        # TODO: also capture which state the state ranking refers to.
        return ranks

    def get_info(url, sess, soup):
        info = soup.select('.info-section li > div,'
                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        infos = {}
        for dat in info:
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            is_stat = 'player-stat' in dat.get('class', [])
            date = ''
            if is_stat and dat.select_one('span') is not None:
                date = dat.select_one('span').text.lower().strip().replace('(', '').replace(')', '')
                dat.select_one('span').decompose()
            label = dat.text.lower().strip().replace(' ', '_').replace(':', '')
            infos[label] = val
            if is_stat:
                infos[label + '_date'] = date
        print('hs only infos', infos)
        return infos

    # It is not entirely clear how PBR's comments section works; this handles
    # the cases seen so far.
    def get_comments(url, sess, soup):
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        for rep_n, rep in enumerate(comments):
            try:
                rep_urls = re.findall('"([^"]*)"', str(rep.select('a[href]')))
                # PBR sometimes writes links relative to its own domain, so
                # prepend prepbaseballreport.com to anything that is not
                # already an absolute URL.
                for i in range(len(rep_urls)):
                    if not str(rep_urls[i]).startswith('http'):
                        rep_urls[i] = 'https://www.prepbaseballreport.com' + str(rep_urls[i])
            except Exception:
                rep_urls = 'no url at index: ' + str(rep_n)
            try:
                date = rep.select_one('.comment-date').text.strip()
            except Exception:
                date = 'no date at index: ' + str(rep_n)
            report = rep.select_one('.comment').text.strip()
            reports.append(report)
            reportdates.append(date)
            reporttypes.append('hs')
            reporturls.append(rep_urls)
        print('hs only reports', reports)
        return reports, reportdates, reporttypes, reporturls

    def get_stats_panel(url, sess, soup):
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        for panel_n, panel in enumerate(statspanels):
            date = panel.select_one('.date').text
            panel_type = panel.select_one('.title').text
            labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
            val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                   if x not in panel.select('thead td')]
            if len(labels) > 0:
                stats['historical_' + str(panel_n + 1) + '_date'] = date
                stats['historical_' + str(panel_n + 1) + '_type'] = panel_type
            for label_n, label in enumerate(labels):
                stats['historical_' + str(panel_n + 1) + '_' + label] = val[label_n]

        print('hs stats', stats)
        return stats

    def get_commitments(url, sess, soup):
        commitments = soup.select('.commitment-draft')
        commits = {}
        for commit in commitments:
            commit_type = commit.select_one('h2').text.strip().lower()
            try:
                # College commitments are usually linked to a school page.
                val = commit.select_one('a').text.strip().lower()
                try:
                    valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                except Exception:
                    valid = ''
                commits[commit_type] = val
                commits[commit_type + '_link'] = valid
            except Exception:
                # Draft entries appear as plain text like 'year, Nth round, team'.
                val = commit.select_one('p').text.strip().lower()
                commits[commit_type] = val
                if 'round' in val:
                    val = val.split(', ')
                    commits['draftyear'] = val[0].strip()
                    commits['draftround'] = val[1].replace('round', '').strip()
                    commits['draftteam'] = val[2].strip()
        print('hs only commits', commits)
        return commits

    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)

    # Collapse the parallel comment lists into one readable string per comment.
    comments_to_make_dict = []
    for i in range(len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i]) + ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)

    return ranks, hsinfos, hsstats, commits, comments_to_make_dict


session = requests.session()
session = pbr_login(session)
determine_levels_played(url=hs_and_college_example, sess=session)
# determine_levels_played(url=only_hs_stat_example, sess=session)
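

# A minimal sketch of persisting one scraped record to MySQL with sqlalchemy
# and pandas, assuming a local database named 'pbr' and the mysql-connector
# driver; the connection string, credentials, and default table name are all
# placeholders, not values from the script:
def save_record(record, table='players'):
    engine = sqlalchemy.create_engine('mysql+mysqlconnector://user:password@localhost/pbr')
    # One flat dict becomes one row; to_sql creates the table if it is missing.
    pd.DataFrame([record]).to_sql(table, engine, if_exists='append', index=False)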