Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import xlwt
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- import time
- from xlutils.copy import copy
- from xlrd import open_workbook
- class MsportScraper:
- def __init__(self):
- self.driver = webdriver.Firefox()
- def get_links_on_matches(self):
- links_list = []
- self.driver.get("http://live.7msport.com/pk_live_en.aspx?view=all&line=no")
- links_soup = BeautifulSoup(self.driver.page_source, 'lxml')
- for link_data in links_soup.find_all('td', class_='data'):
- analyse_js_link = link_data.find_all('a')[0]
- match_id = re.findall(r'\d+', str(analyse_js_link))[0]
- links_list.append(match_id)
- return links_list
- def scroll_shim(self, object):
- passed_in_driver = self.driver
- x = object.location['x']
- y = object.location['y']
- scroll_by_coord = 'window.scrollTo(%s,%s);' % (
- x,
- y
- )
- scroll_nav_out_of_way = 'window.scrollBy(0, -120);'
- passed_in_driver.execute_script(scroll_by_coord)
- passed_in_driver.execute_script(scroll_nav_out_of_way)
- @staticmethod
- def write_results_to_excel(results_list, row, workbook_path=None):
- workbook_created = False
- if workbook_path is None:
- timestamp = time.time()
- book = xlwt.Workbook(encoding="utf-8")
- book.add_sheet("Sheet 1")
- workbook_path = f"results_{timestamp}.xls"
- book.save(workbook_path)
- workbook_created = True
- rb = open_workbook(workbook_path, formatting_info=True)
- wb = copy(rb) # a writable copy (I can't read values out of this, only write to it)
- w_sheet = wb.get_sheet(0)
- for i in range(len(results_list)):
- w_sheet.write(row, i, results_list[i])
- wb.save(workbook_path)
- if workbook_created:
- return workbook_path
- def find_head_to_heads(self, analyse_data):
- over_under = re.findall(r'Totally, (\d+) game\(s\) over, (\d+)', analyse_data)
- try:
- over_under_ratio = int(over_under[0][0]) / (int(over_under[0][0]) + int(over_under[0][1]))
- except ZeroDivisionError:
- over_under_ratio = 1.0
- except IndexError:
- over_under_ratio = None
- over_under2 = re.findall(r'(\d+) game\(s\) half-game over, (\d+)', analyse_data)
- try:
- over_under2_ratio = int(over_under2[0][0]) / (int(over_under2[0][0]) + int(over_under2[0][1]))
- except ZeroDivisionError:
- over_under2_ratio = 1.0
- except IndexError:
- over_under2_ratio = None
- head_to_heads = dict()
- head_to_heads['over_under0.75'] = over_under2_ratio
- head_to_heads['over_under2.5'] = over_under_ratio
- return head_to_heads
- @staticmethod
- def matches_number(soup):
- match_inf = soup.find('div', class_='ana_count').get_text()
- return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
- def scrape_match_information(self):
- match_ids = self.get_links_on_matches()
- soup_main_page = BeautifulSoup(self.driver.page_source, 'lxml')
- workbook_path = None
- row = 0
- print(match_ids)
- for match_id in match_ids:
- match = soup_main_page.find('tr', id=f'bh{match_id}')
- league = match.find('td', class_='match').get_text()
- match_time = match.find('td', class_='time').get_text()
- match_name = match.find('td', class_='home').get_text() + " vs " + match.find('td',
- class_='away').get_text()
- try:
- handicap = match.find('a', class_='rq').get_text()
- except AttributeError:
- handicap = None
- self.driver.get(f"http://analyse.7msport.com/{match_id}/index.shtml")
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- total_home_team = soup.find('div', id='divTeamHistoryA0')
- try:
- a1_elem = self.driver.find_element_by_id('ddTabA1')
- ActionChains(self.driver).move_to_element(a1_elem).perform()
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- home_home_team = soup.find('div', id='divTeamHistoryA1')
- except Exception:
- home_home_team = None
- b0_elem = self.driver.find_element_by_id('ddTabB0')
- self.scroll_shim(b0_elem)
- actions = ActionChains(self.driver).move_to_element(
- self.driver.find_element_by_id('divTeamHistoryB0'))
- actions.perform()
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- total_away_team = soup.find('div', id='divTeamHistoryB0')
- try:
- b1_elem = self.driver.find_element_by_id('ddTabB1')
- ActionChains(self.driver).move_to_element(b1_elem).perform()
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- away_away_team = soup.find('div', id='divTeamHistoryB1')
- except Exception:
- away_away_team = None
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- home = soup.find(id='dtTeamHistoryTitleA').get_text().split(" - ")[0]
- away = soup.find(id='dtTeamHistoryTitleB').get_text().split(" - ")[0]
- time.sleep(4)
- if not home_home_team is None:
- total_matches = self.matches_number(home_home_team)
- home_ground_goal_ratio = 0
- home_ground_goal_conceded_ratio = 0
- for match_information in home_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
- count = 0
- for information in match_information.find_all('td'):
- count += 1
- if count == 4:
- home_ground_goal_ratio += int(information.get_text().split('-')[0])
- home_ground_goal_conceded_ratio += int(information.get_text().split('-')[1])
- home_ground_goal_ratio /= total_matches
- home_ground_goal_conceded_ratio /= total_matches
- else:
- home_ground_goal_ratio = None
- home_ground_goal_conceded_ratio = None
- total_goal_conceded_ratio = 0
- total_goal_ratio = 0
- for match_information in total_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
- count = 0
- home_first_flag = False
- for information in match_information.find_all('td'):
- count += 1
- if home in information.get_text():
- home_first_flag = True
- else:
- pass
- if count == 4 and home_first_flag:
- total_goal_ratio += int(information.get_text().split('-')[0])
- total_goal_conceded_ratio += int(information.get_text().split('-')[1])
- elif count == 4:
- total_goal_ratio += int(information.get_text().split('-')[1])
- total_goal_conceded_ratio += int(information.get_text().split('-')[0])
- total_matches = self.matches_number(total_home_team)
- total_goal_ratio /= total_matches
- total_goal_conceded_ratio /= total_matches
- away_total_goal_ratio = 0
- away_total_goal_conceded_ratio = 0
- for match_information in total_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
- away_first_flag = False
- count = 0
- for information in match_information.find_all('td'):
- count += 1
- if away in information.get_text():
- away_first_flag = True
- else:
- pass
- if count == 4 and away_first_flag:
- away_total_goal_ratio += int(information.get_text().split('-')[0])
- away_total_goal_conceded_ratio += int(information.get_text().split('-')[1])
- elif count == 4:
- away_total_goal_conceded_ratio += int(information.get_text().split('-')[0])
- away_total_goal_ratio += int(information.get_text().split('-')[1])
- total_matches = self.matches_number(total_away_team)
- away_total_goal_ratio /= total_matches
- away_total_goal_conceded_ratio /= total_matches
- if not away_away_team is None:
- away_ground_goal_conceded_ratio = 0
- away_ground_goal_ratio = 0
- matches_number = self.matches_number(away_away_team)
- for match_information in away_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
- count = 0
- for information in match_information.find_all('td'):
- count += 1
- if count == 4:
- away_ground_goal_ratio += int(information.get_text().split('-')[1])
- away_ground_goal_conceded_ratio += int(information.get_text().split('-')[0])
- away_ground_goal_ratio /= matches_number
- away_ground_goal_conceded_ratio /= matches_number
- else:
- away_ground_goal_ratio = None
- away_ground_goal_conceded_ratio = None
- p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
- if p_confrontation == "":
- results = {'over_under2.5': None, "over_under0.75": None}
- results_home_ground = {'over_under2.5': None, "over_under0.75": None}
- else:
- results = self.find_head_to_heads(p_confrontation)
- self.driver.find_element_by_id('WJchk').click()
- time.sleep(1)
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
- results_home_ground = self.find_head_to_heads(p_confrontation)
- p_confrontation = total_home_team.find('div', class_='ana_count').get_text()
- total_home_results = self.find_head_to_heads(p_confrontation)
- if not home_home_team is None:
- p_confrontation = home_home_team.find('div', class_='ana_count').get_text()
- home_home_results = self.find_head_to_heads(p_confrontation)
- else:
- home_home_results = {'over_under2.5': None, "over_under0.75": None}
- p_confrontation = total_away_team.find('div', class_='ana_count').get_text()
- total_away_results = self.find_head_to_heads(p_confrontation)
- if not away_away_team is None:
- p_confrontation = away_away_team.find('div', class_='ana_count').get_text()
- away_away_results = self.find_head_to_heads(p_confrontation)
- else:
- away_away_results = {'over_under2.5': None, "over_under0.75": None}
- results_list = [league,
- match_time,
- match_name,
- handicap,
- total_goal_ratio,
- home_ground_goal_ratio,
- total_goal_conceded_ratio,
- home_ground_goal_conceded_ratio,
- away_total_goal_ratio,
- away_ground_goal_ratio,
- away_total_goal_conceded_ratio,
- away_ground_goal_conceded_ratio,
- results['over_under2.5'],
- results['over_under0.75'],
- results_home_ground['over_under2.5'],
- results_home_ground['over_under0.75'],
- total_home_results['over_under2.5'],
- total_home_results['over_under0.75'],
- home_home_results['over_under2.5'],
- home_home_results['over_under0.75'],
- total_home_results['over_under2.5'],
- total_home_results['over_under0.75'],
- total_away_results['over_under2.5'],
- total_away_results['over_under0.75'],
- away_away_results['over_under2.5'],
- away_away_results['over_under0.75']]
- if workbook_path is None:
- workbook_path = self.write_results_to_excel(results_list, row)
- else:
- self.write_results_to_excel(results_list, row, workbook_path)
- row += 1
- m = MsportScraper()
- m.scrape_match_information()
- ['3992506', '3992285', '3916866', '3991546', '3992538', '3992557', '3991858', '3963106', '3884508', '3991536', '3992510', '3966216', '3990988', '3992586', '3992527', '3924463', '3924464', '3924466', '3992542', '3943380', '3992509', '3992507', '3992519', '3992590', '3991891', '3991895', '3991892', '3992511', '3936743', '3934252', '3934251', '3933256', '3933260', '3931326', '3931327', '3932001', '3932003', '3932004', '3931881', '3931879', '3931880', '3931882', '3916863', '3992587', '3992588', '3962330', '3992526', '3992523', '3992524', '3992525', '3992529', '3992531', '3992528', '3992530', '3901641', '3924461', '3992543', '3992602', '3991897', '3991893', '3965862', '3966217', '3966087', '3966215', '3960158', '3960503', '3960504', '3960501', '3958996', '3992290', '3992518', '3966213', '3966214', '3894005', '3936742', '3931328', '3916867', '3927597', '3937626', '3931541', '3992522', '3992521', '3945065', '3986477', '3941939', '3941934', '3941936', '3941935', '3941938', '3932435', '3932436', '3932437', '3922638', '3992516', '3992540', '3966852', '3992532', '3984255', '3884507', '3936744', '3992591', '3992555', '3992585', '3992545', '3992547', '3992548', '3992544', '3992546', '3992550', '3992515', '3992517', '3991896', '3992539', '3991894', '3991851', '3909576', '3904921', '3904922', '3904919', '3926362', '3933000', '3934140', '3934141', '3934142', '3934138', '3939676', '3936755', '3919876', '3919881', '3942004', '3931330', '3932006', '3932005', '3932007', '3931883', '3931885', '3992286', '3899909', '3938254', '3938384', '3992520', '3901637', '3901639', '3901640', '3956831', '3940261', '3992595', '3992594', '3992611', '3958631', '3958627', '3924467', '3895099', '3895093', '3992558', '3992549', '3992395', '3992512', '3965861', '3992393', '3979002', '3901425', '3979004', '3978878', '3892274', '3898348', '3991902', '3991903', '3991904', '3991905', '3955667', '3955670', '3991391', '3991388', '3991389', '3919878', '3919475', '3919477', '3969026', '3969027', '3969028', '3969078', '3969080', '3969081', '3991635', '3992600', '3933192', '3933196', '3933194', '3933190', '3979001', '3991906', '3896273', '3896275', '3896277', '3896272', '3896388', '3955038', '3934253', '3933257', '3919882', '3919879', '3909985', '3909986', '3909989', '3909990', '3909991', '3909984', '3911154', '3911155', '3911157', '3911158', '3911161', '3911162', '3931331', '3913668', '3913662', '3913664', '3986716', '3990728', '3957085', '3957083', '3971265', '3992536', '3992597', '3992598', '3992599', '3992596', '3943662', '3945068', '3991950', '3992589', '3959812', '3957713', '3957718', '3992565', '3992560', '3992562', '3992564', '3992561', '3992563', '3992559', '3943378', '3991907', '3991908', '3894007', '3902234', '3902231', '3917661', '3917665', '3991533', '3931542', '3991678', '3905196', '3992291', '3956832', '3956833', '3941421', '3941425', '3941618', '3945064', '3945063', '3955505', '3955506', '3955507', '3955508', '3933198', '3926057', '3926058', '3926059', '3926056', '3966854', '3966855', '3966857', '3965857', '3965860', '3966086', '3966084', '3966218', '3966088', '3992533', '3992535', '3992534', '3992614', '3992394', '3933801', '3903047', '3939377', '3939380', '3939373', '3939374', '3893434', '3991912', '3991910', '3991909', '3991911', '3992400', '3992399', '3992396', '3992398', '3992435', '3992441', '3992432', '3992440', '3992439', '3917080', '3917084', '3917081', '3967666', '3992572', '3992571', '3992574', '3992573', '3983545', '3983546', '3952152', '3952154', '3980082', '3958374', '3958376', '3956834', '3991951', '3991955', '3991953', '3991952', '3991954', '3924462', '3941937', '3991960', '3991962', '3991961', '3942293', '3966089', '3991966', '3992391', '3991636', '3904561', '3904562', '3904564', '3904567', '3904568', '3901429', '3894809', '3896276', '3926354', '3991679', '3991681', '3991683', '3991685', '3991686', '3991687', '3991688', '3991680', '3991682', '3991684', '3991689', '3991690', '3991691', '3955509', '3991956', '3991957', '3942494', '3942485', '3942492', '3991637', '3909572', '3892272', '3892278', '3892279', '3915053', '3900039', '3900040', '3900041', '3900042', '3911873', '3898345', '3898346', '3898347', '3898349', '3898350', '3898351', '3898353', '3898354', '3898355', '3991913', '3900233', '3900234', '3900231', '3900232', '3900634', '3899302', '3899304', '3899306', '3899308', '3899311', '3899312', '3899313', '3899314', '3899315', '3899316', '3899310', '3992401', '3991390', '3913244', '3913245', '3913246', '3913241', '3913242', '3913243', '3917082', '3917083', '3917085', '3934139', '3936758', '3916861', '3985477', '3957086', '3927593', '3927599', '3937624', '3937623', '3937622', '3937621', '3938140', '3938141', '3938142', '3938144', '3938145', '3947433', '3983548', '3983549', '3983550', '3983551', '3983552', '3983547', '3927300', '3927249', '3927327', '3900375', '3900376', '3900377', '3900379', '3900378', '3900635', '3900636', '3900637', '3900638', '3900639', '3899907', '3962333', '3929605', '3929606', '3929607', '3929608', '3929604', '3929609', '3929611', '3929610', '3929612', '3929613', '3929614', '3908532', '3908539', '3908540', '3908534', '3908535', '3908536', '3908541', '3908533', '3908531', '3908537', '3908538', '3910755', '3910754', '3910757', '3910758', '3910759', '3910760', '3910761', '3910762', '3910763', '3910764', '3910765', '3909060', '3909061', '3909057', '3909059', '3909051', '3909052', '3909053', '3909054', '3909055', '3909056', '3909058', '3929127', '3929128', '3929129', '3929131', '3929133', '3929135', '3929130', '3929132', '3929134', '3929136', '3929137', '3920477', '3920478', '3920479', '3920480', '3920481', '3920484', '3920485', '3920486', '3920482', '3920483', '3920487', '3898928', '3898922', '3898923', '3898925', '3898926', '3898927', '3898929', '3898918', '3898919', '3898920', '3898921', '3958375', '3956835', '3986188', '3986130', '3986532', '3921855', '3992551', '3992537', '3992513', '3930157', '3930163', '3930158', '3930159', '3930160', '3930161', '3930162', '3930164', '3930165', '3930166', '3930167', '3979118', '3979116', '3979117', '3957084', '3959813', '3991914', '3896389', '3955668', '3913667', '3937625', '3956836', '3947562', '3956837', '3948394', '3901030', '3901031', '3901034', '3894003', '3991918', '3991916', '3991915', '3991917', '3902232', '3906052', '3902858', '3914163', '3992443', '3992442', '3916148', '3986704', '3986703', '3967667', '3917662', '3931537', '3905197', '3905198', '3905199', '3905194', '3992271', '3991959', '3921850', '3992514', '3992541', '3965864', '3893070', '3903043', '3893436', '3902859', '3896390', '3951736', '3991964', '3991963', '3991967', '3933810', '3907453', '3911877', '3939372', '3991924', '3991925', '3991927', '3991935', '3991919', '3991921', '3991923', '3991920', '3991933', '3991936', '3991922', '3991926', '3991928', '3991934', '3991900', '3926358', '3992444', '3985473', '3947435', '3967142', '3967136', '3967137', '3941619', '3923328', '3925676', '3991958', '3927592', '3910753', '3904563', '3909574', '3892275', '3893664', '3991937', '3914158', '3967141', '3965265', '3896391', '3915046', '3902499', '3902497', '3991938', '3914162', '3920915', '3992592', '3917663', '3965263', '3906054', '3950722', '3991215', '3893665', '3893666', '3893069', '3893072', '3893075', '3893076', '3907452', '3907450', '3911869', '3893437', '3950725', '3950726', '3950727', '3950723', '3950724', '3918081', '3990729', '3922250', '3907455', '3916149', '3992556', '3945865', '3933804', '3893662', '3952155', '3952531', '3909569', '3928720', '3928715', '3928718', '3928719', '3992603', '3928396', '3928397', '3928401', '3962927', '3962928', '3962924', '3991205', '3992552', '3992554', '3985893', '3915048', '3992605', '3992604', '3991216', '3925129', '3991596', '3991584', '3917942', '3992610', '3992609', '3992608', '3991206', '3940722', '3925137', '3925139', '3917939', '3992606', '3992593', '3991594', '3925133', '3992607', '3982531', '3982532', '3982530', '3992010', '3991997', '3992601', '3991982', '3990916', '3982533', '3991857', '3850605', '3850611', '3850613', '3850606', '3850610', '3850607', '3850612', '3850609', '3850608', '3984254', '3991888', '3990932', '3963105', '3982529', '3991889', '3991890', '3958024', '3991850', '3911156', '3911160', '3913669', '3913666']
- Traceback (most recent call last):
- File "<ipython-input-9-77b4115b18b3>", line 266, in <module>
- m.scrape_match_information()
- File "<ipython-input-9-77b4115b18b3>", line 191, in scrape_match_information
- matches_number = self.matches_number(away_away_team)
- File "<ipython-input-9-77b4115b18b3>", line 80, in matches_number
- return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
- IndexError: list index out of range
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement