Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from urllib.request import urlopen
- from urllib.error import URLError
- from bs4 import BeautifulSoup
- import sys # For exiting safely
- import sqlalchemy
- import mysql.connector
- from sqlalchemy.types import TEXT
- import math
- from random import random
- import time
- import subprocess
- import datetime
- import numpy as np
- import requests
- from itertools import permutations
- from itertools import product
- from iteration_utilities import deepflatten
- import re
- import itertools
- from inspect import currentframe
#a function to make debugging easier
def get_linenumber():
    """Return the line number of the calling statement, as a string."""
    caller = currentframe().f_back
    return str(caller.f_lineno)
# Widen pandas display limits so wide scrape results print without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
#this is the point in the code where we should import the PBR urls we want to use
# NOTE(review): both example URLs below are identical — the "HS only" example
# presumably should point at a different profile; confirm before relying on it.
hs_and_college_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'
only_hs_stat_example = 'https://www.prepbaseballreport.com/profiles/OH/Jakob-Brustoski-1304697258'
#this function helps you log into PBR
def pbr_login(sess):
    """Log the given requests session into prepbaseballreport.com.

    Fetches the login page, extracts the hidden CSRF ``form_key`` field,
    then POSTs the credentials to the login action URL.

    Args:
        sess: a requests.Session (cookies persist the authenticated state).

    Returns:
        The same session, now authenticated.

    Raises:
        RuntimeError: if the login form's ``form_key`` cannot be located.
        requests.HTTPError: if either HTTP request returns an error status.
    """
    loginurl = 'https://www.prepbaseballreport.com/customer/account/login/'
    loginaction = 'https://www.prepbaseballreport.com/customer/account/loginPost/'
    r1 = sess.get(loginurl)
    # FIX: previously no status check — a failed fetch produced an opaque
    # TypeError below when select_one() returned None.
    r1.raise_for_status()
    logsoup = BeautifulSoup(r1.text, 'html.parser')
    form_key_input = logsoup.select_one('#login-form input[name="form_key"]')
    if form_key_input is None:
        raise RuntimeError('Could not find form_key on PBR login page '
                           '(page layout may have changed)')
    form_key = form_key_input['value']
    # SECURITY NOTE(review): credentials are hard-coded; consider moving them
    # to environment variables or a config file kept out of version control.
    login_data = {'login[username]':'mohammed246@cmailing.com', 'login[password]': 'dinger2034', 'form_key': form_key}
    r = sess.post(loginaction, data=login_data, headers=dict(Referer=loginurl))
    r.raise_for_status()
    print('Login Success')
    return sess
#this is a function to determine if the player played in highschool, college, or both
#it doesn't seem like there are players who only play in college on PBR but as we try more cases maybe some will come up?
def determine_levels_played(url, sess):
    """Fetch a PBR profile page and dispatch to the appropriate scraper.

    Returns None for a 404; otherwise hands the parsed page to college()
    when a '#college-profile' section exists, else to no_college().
    """
    response = sess.get(url, headers=dict(Referer=url))
    if response.status_code == 404:
        return None
    page = BeautifulSoup(response.text, 'html.parser')
    if page.select_one('#college-profile') is not None:
        college(url=url, sess=sess, soup=page)
    else:
        print('this guy has no college')
        no_college(url=url, sess=sess, soup=page)
#this is for players with both a college and highschool profile
def college(url, sess, soup):
    """Scrape all data for a player with BOTH a HS and a college profile.

    Collects rankings, bio/metric info (HS and college), scouting comments,
    historical stats panels and commitment/draft info from the already-parsed
    profile page.

    NOTE(review): everything collected is only printed / held in locals —
    nothing is returned or persisted yet; presumably DB storage comes later.
    """
    print('This is data for a player with both a hs and college profile')
    def get_rankings(url, sess, soup):
        # One '.player-rank' box per ranking category.
        rankings = soup.select('.player-rank')
        ranks = {}
        if len(rankings) > 0:
            for rank_n, rank in enumerate(rankings):
                ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
                # Strip decorative <span>s so only the rank numbers remain.
                for trsh in rank.select('span'):
                    trsh.decompose()
                ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
                ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print('Here are the rankings: ', ranks)
        return(ranks)
    def get_info(url, sess, soup):
        # Bio/metric rows from the HS side of the profile.
        info = soup.select('.info-section li > div,'
                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        infos = {}
        for dat_n, dat in enumerate(info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                # 'player-stat' rows carry a "(date)" span next to the value.
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                # bare except: rows without a "class" attr raise KeyError here.
                # NOTE(review): 'date' can be left over from a prior iteration.
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    infos[label + "_date"] = date
            except:
                pass
        # Re-fetch the page anchored at the college tab and repeat the parse.
        college_r = sess.get(str(url)+'#college-profile', headers=dict(Referer=url))
        college_soup = BeautifulSoup(college_r.text, 'html.parser')
        college_info = college_soup.select('.info-section li > div,'
                                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        college_infos = {}
        for dat_n, dat in enumerate(college_info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            college_infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    college_infos[label + "_date"] = date
            except:
                pass
        print('hs infos', infos)
        print('college infos', college_infos)
        return(infos, college_infos)
    def get_comments(url, sess, soup):
        # HS scouting comments.
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        if len(comments) > 0:
            for rep_n, rep in enumerate(comments):
                try:
                    # Set the fallback first, then overwrite if a link exists.
                    url = 'no url listed at index ' + str(rep_n)
                    url = rep.select_one('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    url = 'https://www.prepbaseballreport.com' + str(url[0])
                except:
                    url = 'no url listed at index ' + str(rep_n)
                try:
                    date ='no date listed at index ' + str(rep_n)
                    date = rep.select_one('.comment-date').text.strip()
                except:
                    date = 'no date listed at index ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                # label = "report_" + str(rep_n+1)
                reports.append(report)
                reportdates.append(date)
                reporttypes.append('hs')
                reporturls.append(url)
        # College comments live in tab 2; the '#stats_panel' child acts as an
        # "is there anything here?" probe before iterating.
        comments_college_test = soup.select('#college_tab2 > #stats_panel')
        comments_college = soup.select('#college_tab2')
        if len(comments_college_test)> 0:
            for rep_n, rep in enumerate(comments_college):
                try:
                    url = 'no url listed at index ' + str(rep_n)
                    url = rep.select('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    #PBR when it writes links sometimes only uses the things that following it's own prepbaseballreport.com so the follow helps grab the useful URL
                    for i in range(len(url)):
                        if (str(url[i])[:4] == 'http'):
                            url[i] = url[i]
                        elif (str(url[i][:4]) != 'http'):
                            url[i] = 'https://www.prepbaseballreport.com' + str(url[i])
                except:
                    url = "no url listed at index " + str(rep_n)
                try:
                    dates = 'no date listed at index ' + str(rep_n)
                    text_to_search = str(rep)
                    pattern = '<strong>(.*?)</strong>'
                    dates = re.findall(pattern, text_to_search)
                except:
                    dates ='no date listed at index ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                reports.append(report)
                reportdates.append(dates)
                reporttypes.append('college')
                reporturls.append(url)
        #this will make it easier to understanding the data later on, otherwise we have a list with a list added on to the end
        #the second list contains the college data, but it is hard to track them all together
        def flatten(L):
            # Recursively flatten arbitrarily nested lists into a flat stream.
            for l in L:
                if isinstance(l, list):
                    yield from flatten(l)
                else:
                    yield l
        try:
            reports = list(flatten(reports))
        except:
            pass
        try:
            reportdates = list(flatten(reportdates))
        except:
            pass
        try:
            reporttypes = list(flatten(reporttypes))
        except:
            pass
        try:
            reporturls = list(flatten(reporturls))
        except:
            pass
        print('reports', reportdates)
        return(reports, reportdates, reporttypes, reporturls)
    def get_stats_panel(url, sess, soup):
        # Historical stat tables embedded in the HS news/headlines list.
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        if len(statspanels) > 0:
            for panel_n, panel in enumerate(statspanels):
                date = panel.select_one('.date').text
                type = panel.select_one('.title').text  # NOTE: shadows builtin 'type'
                labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                # Body cells = all <td> minus the header <td>s.
                val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                       if x not in panel.select('thead td')]
                if len(labels) > 0:
                    stats['historical_' + str(panel_n + 1) + "_date"] = date
                    stats['historical_' + str(panel_n + 1) + "_type"] = type
                    # print(panel)
                    for label_n, label in enumerate(labels):
                        stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
        # Same parse repeated for the college stats tab.
        college_statspanels = soup.select('#college_tab4')
        college_stats = {}
        if len(college_statspanels) > 0:
            try:
                for panel_n, panel in enumerate(college_statspanels):
                    date = panel.select_one('.date').text
                    type = panel.select_one('.title').text
                    labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                    val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                           if x not in panel.select('thead td')]
                    if len(labels) > 0:
                        college_stats['historical_' + str(panel_n + 1) + "_date"] = date
                        college_stats['historical_' + str(panel_n + 1) + "_type"] = type
                        # print(panel)
                        for label_n, label in enumerate(labels):
                            college_stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
            except:
                # Any parse hiccup on the college tab discards ALL college stats.
                college_stats = {}
        print('hs stats', stats)
        print('college stats', college_stats)
        return(stats, college_stats)
    def get_commitments(url, sess, soup):
        # Commitment / draft boxes.
        commitments = soup.select('.commitment-draft')
        commits = {}
        if len(commitments) > 0:
            for commit in commitments:
                type = commit.select_one('h2').text.strip().lower()
                try:
                    # Linked entry (e.g. college commitment): text + href.
                    val = commit.select_one('a').text.strip().lower()
                    try:
                        valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                    except:
                        valid = ""
                    commits[type] = val
                    commits[type + "_link"] = valid
                except:
                    # Plain-text entry (e.g. draft info in a <p>).
                    val = commit.select_one('p').text.strip().lower()
                    commits[type] = val
                    if "round" in val:
                        # Looks like "year, Nth round, team" — split apart.
                        val = val.split(", ")
                        commits["draftyear"] = val[0].strip()
                        commits["draftround"] = val[1].replace("round", "").strip()
                        commits["draftteam"] = val[2].strip()
        print('commits', commits)
        return commits
    # Run all scrapers and assemble one string per comment for later storage.
    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos, collegeinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats, collegestats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)
    comments_to_make_dict = []
    for i in range (len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i])+ ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)
#this is the code that gets info for players without a college profile
def no_college(url, sess, soup):
    """Scrape all data for a player who only has a high-school profile.

    Collects rankings, bio/metric info, scouting comments, historical stats
    panels and commitment/draft info from the already-parsed profile page.

    FIX: the original invoked every get_* helper twice in a row — once
    discarding the results, then again to bind them — doubling the parsing
    work and printing everything twice. The redundant first batch of calls
    has been removed; behavior is otherwise unchanged.

    NOTE(review): like college(), the collected data is only printed / held
    in locals — nothing is returned or persisted yet.
    """
    def get_rankings(url, sess, soup):
        # One '.player-rank' box per ranking category.
        rankings = soup.select('.player-rank')
        ranks = {}
        if len(rankings) > 0:
            for rank_n, rank in enumerate(rankings):
                ranklabel = rank.select_one('.rank-label').text.lower().strip().replace(' ', '_')
                # Strip decorative <span>s so only the rank numbers remain.
                for trsh in rank.select('span'):
                    trsh.decompose()
                ranks[ranklabel + '_pos_rank'] = rank.select_one('.pos-rank').text.strip()
                ranks[ranklabel + '_pbr_rank'] = rank.select_one('.pbr-rank').text.strip()
        print(ranks)
        return(ranks)
    #in order to get the state for the state ranking
    def get_info(url, sess, soup):
        # Bio/metric rows from the profile's info section.
        info = soup.select('.info-section li > div,'
                           ' .info-section li:not(:has(div), .stat-head, .stat-holder), .info-section li > span')
        infos = {}
        for dat_n, dat in enumerate(info):
            val = dat.select_one('strong').text.lower().strip()
            dat.select_one('strong').decompose()
            try:
                # 'player-stat' rows carry a "(date)" span next to the value.
                if "player-stat" in dat["class"]:
                    if dat.select_one('span') is not None:
                        date = dat.select_one('span').text.lower().strip().replace("(", "").replace(")", "")
                        dat.select_one('span').decompose()
                    else:
                        date = ""
            except:
                pass
            label = dat.text.lower().strip().replace(' ', '_').replace(":", "")
            infos[label] = val
            try:
                if "player-stat" in dat["class"]:
                    infos[label + "_date"] = date
            except:
                pass
        print('hs only infos', infos)
        return(infos)
    #unclear how the comments section on PBR actually works - I think that this is good enough probably
    def get_comments(url, sess, soup):
        comments = soup.select('.player-comments .player-comment')
        reports = []
        reportdates = []
        reporttypes = []
        reporturls = []
        if len(comments) > 0:
            for rep_n, rep in enumerate(comments):
                try:
                    # Set the fallback first, then overwrite if links exist.
                    url = 'no url at index: ' + str(rep_n)
                    url = rep.select('a[href]')
                    url = re.findall('"([^"]*)"', str(url))
                    # PBR when it writes links sometimes only uses the things that following it's own prepbaseballreport.com so the follow helps grab the useful URL
                    for i in range(len(url)):
                        if (str(url[i])[:4] == 'http'):
                            url[i] = url[i]
                        elif (str(url[i][:4]) != 'http'):
                            url[i] = 'https://www.prepbaseballreport.com' + str(url[i])
                except:
                    url = 'no url at index: ' + str(rep_n)
                try:
                    date = 'no date at index: ' + str(rep_n)
                    date = rep.select_one('.comment-date').text.strip()
                except:
                    date = 'no date at index: ' + str(rep_n)
                report = rep.select_one('.comment').text.strip()
                # label = "report_" + str(rep_n+1)
                reports.append(report)
                reportdates.append(date)
                reporttypes.append('hs')
                reporturls.append(url)
        print('hs only reports', reports)
        return(reports, reportdates, reporttypes, reporturls)
    def get_stats_panel(url, sess, soup):
        # Historical stat tables embedded in the news/headlines list.
        statspanels = soup.select('.headlines-list.news-list li:has(.stats-panel)')
        stats = {}
        if len(statspanels) > 0:
            for panel_n, panel in enumerate(statspanels):
                date = panel.select_one('.date').text
                type = panel.select_one('.title').text  # NOTE: shadows builtin 'type'
                labels = [x.text.strip().lower().replace(' ', '_') for x in panel.select('thead td')]
                # Body cells = all <td> minus the header <td>s.
                val = [x.text.strip().lower().replace(' ', '_') for x in panel.select('td')
                       if x not in panel.select('thead td')]
                if len(labels) > 0:
                    stats['historical_' + str(panel_n + 1) + "_date"] = date
                    stats['historical_' + str(panel_n + 1) + "_type"] = type
                    # print(panel)
                    for label_n, label in enumerate(labels):
                        stats['historical_' + str(panel_n + 1) + "_" + label] = val[label_n]
        print('hs stats', stats)
        return(stats)
    def get_commitments(url, sess, soup):
        # Commitment / draft boxes.
        commitments = soup.select('.commitment-draft')
        commits = {}
        if len(commitments) > 0:
            for commit in commitments:
                type = commit.select_one('h2').text.strip().lower()
                try:
                    # Linked entry (e.g. college commitment): text + href.
                    val = commit.select_one('a').text.strip().lower()
                    try:
                        valid = 'https://www.prepbaseballreport.com' + str(commit.select_one('a')['href'].strip().lower())
                    except:
                        valid = ""
                    commits[type] = val
                    commits[type + "_link"] = valid
                except:
                    # Plain-text entry (e.g. draft info in a <p>).
                    val = commit.select_one('p').text.strip().lower()
                    commits[type] = val
                    if "round" in val:
                        # Looks like "year, Nth round, team" — split apart.
                        val = val.split(", ")
                        commits["draftyear"] = val[0].strip()
                        commits["draftround"] = val[1].replace("round", "").strip()
                        commits["draftteam"] = val[2].strip()
        print('hs only commits', commits)
        return(commits)
    # Run each scraper exactly once and assemble one string per comment.
    ranks = get_rankings(url=url, sess=sess, soup=soup)
    hsinfos = get_info(url=url, sess=sess, soup=soup)
    r, rd, rt, ru = get_comments(url=url, sess=sess, soup=soup)
    hsstats = get_stats_panel(url=url, sess=sess, soup=soup)
    commits = get_commitments(url=url, sess=sess, soup=soup)
    comments_to_make_dict = []
    for i in range(len(r)):
        temp = 'type: ' + str(rt[i]) + ' date: ' + str(rd[i]) + ' URL: ' + str(ru[i]) + ' the comment: ' + str(r[i])
        comments_to_make_dict.append(temp)
# Entry point: authenticate one shared session, then scrape the example profile.
session = pbr_login(requests.session())
determine_levels_played(url=hs_and_college_example, sess=session)
#determine_levels_played(url=only_hs_stat_example, sess=session)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement