SHARE
TWEET

Untitled

a guest Dec 7th, 2019 102 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.  
  2. import re
  3. import xlwt
  4. from bs4 import BeautifulSoup
  5. from selenium import webdriver
  6. from selenium.webdriver.common.action_chains import ActionChains
  7. import time
  8. from xlutils.copy import copy
  9.  
  10. from xlrd import open_workbook
  11.  
  12.  
  13. class MsportScraper:
  14.     def __init__(self):
  15.         self.driver = webdriver.Firefox()
  16.    
  17.     def get_links_on_matches(self):
  18.         links_list = []
  19.         self.driver.get("http://live.7msport.com/pk_live_en.aspx?view=all&line=no")
  20.         links_soup = BeautifulSoup(self.driver.page_source, 'lxml')
  21.         for link_data in links_soup.find_all('td', class_='data'):
  22.             analyse_js_link = link_data.find_all('a')[0]
  23.             match_id = re.findall(r'\d+', str(analyse_js_link))[0]
  24.             links_list.append(match_id)
  25.         return links_list
  26.    
  27.     def scroll_shim(self, object):
  28.         passed_in_driver = self.driver
  29.         x = object.location['x']
  30.         y = object.location['y']
  31.         scroll_by_coord = 'window.scrollTo(%s,%s);' % (
  32.             x,
  33.             y
  34.         )
  35.         scroll_nav_out_of_way = 'window.scrollBy(0, -120);'
  36.         passed_in_driver.execute_script(scroll_by_coord)
  37.         passed_in_driver.execute_script(scroll_nav_out_of_way)
  38.    
  39.     @staticmethod
  40.     def write_results_to_excel(results_list, row, workbook_path=None):
  41.         workbook_created = False
  42.         if workbook_path is None:
  43.             timestamp = time.time()
  44.             book = xlwt.Workbook(encoding="utf-8")
  45.             book.add_sheet("Sheet 1")
  46.             workbook_path = f"results_{timestamp}.xls"
  47.             book.save(workbook_path)
  48.             workbook_created = True
  49.         rb = open_workbook(workbook_path, formatting_info=True)
  50.         wb = copy(rb)  # a writable copy (I can't read values out of this, only write to it)
  51.         w_sheet = wb.get_sheet(0)
  52.         for i in range(len(results_list)):
  53.             w_sheet.write(row, i, results_list[i])
  54.         wb.save(workbook_path)
  55.         if workbook_created:
  56.             return workbook_path
  57.    
  58.     def find_head_to_heads(self, analyse_data):
  59.         over_under = re.findall(r'Totally, (\d+) game\(s\) over, (\d+)', analyse_data)
  60.         try:
  61.             over_under_ratio = int(over_under[0][0]) / (int(over_under[0][0]) + int(over_under[0][1]))
  62.         except ZeroDivisionError:
  63.             over_under_ratio = 1.0
  64.         except IndexError:
  65.             over_under_ratio = None
  66.         over_under2 = re.findall(r'(\d+) game\(s\) half-game over, (\d+)', analyse_data)
  67.         try:
  68.             over_under2_ratio = int(over_under2[0][0]) / (int(over_under2[0][0]) + int(over_under2[0][1]))
  69.         except ZeroDivisionError:
  70.             over_under2_ratio = 1.0
  71.         except IndexError:
  72.             over_under2_ratio = None
  73.         head_to_heads = dict()
  74.         head_to_heads['over_under0.75'] = over_under2_ratio
  75.         head_to_heads['over_under2.5'] = over_under_ratio
  76.         return head_to_heads
  77.    
  78.     @staticmethod
  79.     def matches_number(soup):
  80.         match_inf = soup.find('div', class_='ana_count').get_text()
  81.         return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
  82.    
  83.     def scrape_match_information(self):
  84.         match_ids = self.get_links_on_matches()
  85.         soup_main_page = BeautifulSoup(self.driver.page_source, 'lxml')
  86.         workbook_path = None
  87.         row = 0
  88.         print(match_ids)
  89.         for match_id in match_ids:
  90.             match = soup_main_page.find('tr', id=f'bh{match_id}')
  91.             league = match.find('td', class_='match').get_text()
  92.             match_time = match.find('td', class_='time').get_text()
  93.             match_name = match.find('td', class_='home').get_text() + " vs " + match.find('td',
  94.                                                                                           class_='away').get_text()
  95.             try:
  96.                 handicap = match.find('a', class_='rq').get_text()
  97.             except AttributeError:
  98.                 handicap = None
  99.             self.driver.get(f"http://analyse.7msport.com/{match_id}/index.shtml")
  100.             soup = BeautifulSoup(self.driver.page_source, 'lxml')
  101.            
  102.             total_home_team = soup.find('div', id='divTeamHistoryA0')
  103.            
  104.             try:
  105.                 a1_elem = self.driver.find_element_by_id('ddTabA1')
  106.                 ActionChains(self.driver).move_to_element(a1_elem).perform()
  107.                 soup = BeautifulSoup(self.driver.page_source, 'lxml')
  108.                 home_home_team = soup.find('div', id='divTeamHistoryA1')
  109.             except Exception:
  110.                 home_home_team = None
  111.             b0_elem = self.driver.find_element_by_id('ddTabB0')
  112.             self.scroll_shim(b0_elem)
  113.             actions = ActionChains(self.driver).move_to_element(
  114.                 self.driver.find_element_by_id('divTeamHistoryB0'))
  115.             actions.perform()
  116.             soup = BeautifulSoup(self.driver.page_source, 'lxml')
  117.             total_away_team = soup.find('div', id='divTeamHistoryB0')
  118.             try:
  119.                 b1_elem = self.driver.find_element_by_id('ddTabB1')
  120.                 ActionChains(self.driver).move_to_element(b1_elem).perform()
  121.                 soup = BeautifulSoup(self.driver.page_source, 'lxml')
  122.                 away_away_team = soup.find('div', id='divTeamHistoryB1')
  123.             except Exception:
  124.                 away_away_team = None
  125.             soup = BeautifulSoup(self.driver.page_source, 'lxml')
  126.             home = soup.find(id='dtTeamHistoryTitleA').get_text().split(" - ")[0]
  127.             away = soup.find(id='dtTeamHistoryTitleB').get_text().split(" - ")[0]
  128.             time.sleep(4)
  129.             if not home_home_team is None:
  130.                 total_matches = self.matches_number(home_home_team)
  131.                 home_ground_goal_ratio = 0
  132.                 home_ground_goal_conceded_ratio = 0
  133.                 for match_information in home_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
  134.                     count = 0
  135.                     for information in match_information.find_all('td'):
  136.                         count += 1
  137.                         if count == 4:
  138.                             home_ground_goal_ratio += int(information.get_text().split('-')[0])
  139.                             home_ground_goal_conceded_ratio += int(information.get_text().split('-')[1])
  140.                 home_ground_goal_ratio /= total_matches
  141.                 home_ground_goal_conceded_ratio /= total_matches
  142.            
  143.             else:
  144.                 home_ground_goal_ratio = None
  145.                 home_ground_goal_conceded_ratio = None
  146.             total_goal_conceded_ratio = 0
  147.             total_goal_ratio = 0
  148.             for match_information in total_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
  149.                 count = 0
  150.                 home_first_flag = False
  151.                
  152.                 for information in match_information.find_all('td'):
  153.                     count += 1
  154.                     if home in information.get_text():
  155.                         home_first_flag = True
  156.                     else:
  157.                         pass
  158.                     if count == 4 and home_first_flag:
  159.                         total_goal_ratio += int(information.get_text().split('-')[0])
  160.                         total_goal_conceded_ratio += int(information.get_text().split('-')[1])
  161.                     elif count == 4:
  162.                         total_goal_ratio += int(information.get_text().split('-')[1])
  163.                         total_goal_conceded_ratio += int(information.get_text().split('-')[0])
  164.             total_matches = self.matches_number(total_home_team)
  165.             total_goal_ratio /= total_matches
  166.             total_goal_conceded_ratio /= total_matches
  167.             away_total_goal_ratio = 0
  168.             away_total_goal_conceded_ratio = 0
  169.             for match_information in total_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
  170.                 away_first_flag = False
  171.                 count = 0
  172.                
  173.                 for information in match_information.find_all('td'):
  174.                     count += 1
  175.                     if away in information.get_text():
  176.                         away_first_flag = True
  177.                     else:
  178.                         pass
  179.                     if count == 4 and away_first_flag:
  180.                         away_total_goal_ratio += int(information.get_text().split('-')[0])
  181.                         away_total_goal_conceded_ratio += int(information.get_text().split('-')[1])
  182.                     elif count == 4:
  183.                         away_total_goal_conceded_ratio += int(information.get_text().split('-')[0])
  184.                        
  185.                         away_total_goal_ratio += int(information.get_text().split('-')[1])
  186.             total_matches = self.matches_number(total_away_team)
  187.             away_total_goal_ratio /= total_matches
  188.             away_total_goal_conceded_ratio /= total_matches
  189.             if not away_away_team is None:
  190.                 away_ground_goal_conceded_ratio = 0
  191.                 away_ground_goal_ratio = 0
  192.                 matches_number = self.matches_number(away_away_team)
  193.                 for match_information in away_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
  194.                     count = 0
  195.                     for information in match_information.find_all('td'):
  196.                         count += 1
  197.                         if count == 4:
  198.                             away_ground_goal_ratio += int(information.get_text().split('-')[1])
  199.                             away_ground_goal_conceded_ratio += int(information.get_text().split('-')[0])
  200.                 away_ground_goal_ratio /= matches_number
  201.                 away_ground_goal_conceded_ratio /= matches_number
  202.             else:
  203.                 away_ground_goal_ratio = None
  204.                 away_ground_goal_conceded_ratio = None
  205.            
  206.             p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
  207.             if p_confrontation == "":
  208.                 results = {'over_under2.5': None, "over_under0.75": None}
  209.                 results_home_ground = {'over_under2.5': None, "over_under0.75": None}
  210.            
  211.             else:
  212.                 results = self.find_head_to_heads(p_confrontation)
  213.                 self.driver.find_element_by_id('WJchk').click()
  214.                 time.sleep(1)
  215.                 soup = BeautifulSoup(self.driver.page_source, 'lxml')
  216.                 p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
  217.                 results_home_ground = self.find_head_to_heads(p_confrontation)
  218.             p_confrontation = total_home_team.find('div', class_='ana_count').get_text()
  219.             total_home_results = self.find_head_to_heads(p_confrontation)
  220.            
  221.             if not home_home_team is None:
  222.                 p_confrontation = home_home_team.find('div', class_='ana_count').get_text()
  223.                 home_home_results = self.find_head_to_heads(p_confrontation)
  224.             else:
  225.                 home_home_results = {'over_under2.5': None, "over_under0.75": None}
  226.             p_confrontation = total_away_team.find('div', class_='ana_count').get_text()
  227.             total_away_results = self.find_head_to_heads(p_confrontation)
  228.             if not away_away_team is None:
  229.                 p_confrontation = away_away_team.find('div', class_='ana_count').get_text()
  230.                 away_away_results = self.find_head_to_heads(p_confrontation)
  231.             else:
  232.                 away_away_results = {'over_under2.5': None, "over_under0.75": None}
  233.             results_list = [league,
  234.                             match_time,
  235.                             match_name,
  236.                             handicap,
  237.                             total_goal_ratio,
  238.                             home_ground_goal_ratio,
  239.                             total_goal_conceded_ratio,
  240.                             home_ground_goal_conceded_ratio,
  241.                             away_total_goal_ratio,
  242.                             away_ground_goal_ratio,
  243.                             away_total_goal_conceded_ratio,
  244.                             away_ground_goal_conceded_ratio,
  245.                             results['over_under2.5'],
  246.                             results['over_under0.75'],
  247.                             results_home_ground['over_under2.5'],
  248.                             results_home_ground['over_under0.75'],
  249.                             total_home_results['over_under2.5'],
  250.                             total_home_results['over_under0.75'],
  251.                             home_home_results['over_under2.5'],
  252.                             home_home_results['over_under0.75'],
  253.                             total_home_results['over_under2.5'],
  254.                             total_home_results['over_under0.75'],
  255.                             total_away_results['over_under2.5'],
  256.                             total_away_results['over_under0.75'],
  257.                             away_away_results['over_under2.5'],
  258.                             away_away_results['over_under0.75']]
  259.             if workbook_path is None:
  260.                 workbook_path = self.write_results_to_excel(results_list, row)
  261.             else:
  262.                 self.write_results_to_excel(results_list, row, workbook_path)
  263.             row += 1
  264.  
  265.  
  266. m = MsportScraper()
  267. m.scrape_match_information()
  268. ['3992506', '3992285', '3916866', '3991546', '3992538', '3992557', '3991858', '3963106', '3884508', '3991536', '3992510', '3966216', '3990988', '3992586', '3992527', '3924463', '3924464', '3924466', '3992542', '3943380', '3992509', '3992507', '3992519', '3992590', '3991891', '3991895', '3991892', '3992511', '3936743', '3934252', '3934251', '3933256', '3933260', '3931326', '3931327', '3932001', '3932003', '3932004', '3931881', '3931879', '3931880', '3931882', '3916863', '3992587', '3992588', '3962330', '3992526', '3992523', '3992524', '3992525', '3992529', '3992531', '3992528', '3992530', '3901641', '3924461', '3992543', '3992602', '3991897', '3991893', '3965862', '3966217', '3966087', '3966215', '3960158', '3960503', '3960504', '3960501', '3958996', '3992290', '3992518', '3966213', '3966214', '3894005', '3936742', '3931328', '3916867', '3927597', '3937626', '3931541', '3992522', '3992521', '3945065', '3986477', '3941939', '3941934', '3941936', '3941935', '3941938', '3932435', '3932436', '3932437', '3922638', '3992516', '3992540', '3966852', '3992532', '3984255', '3884507', '3936744', '3992591', '3992555', '3992585', '3992545', '3992547', '3992548', '3992544', '3992546', '3992550', '3992515', '3992517', '3991896', '3992539', '3991894', '3991851', '3909576', '3904921', '3904922', '3904919', '3926362', '3933000', '3934140', '3934141', '3934142', '3934138', '3939676', '3936755', '3919876', '3919881', '3942004', '3931330', '3932006', '3932005', '3932007', '3931883', '3931885', '3992286', '3899909', '3938254', '3938384', '3992520', '3901637', '3901639', '3901640', '3956831', '3940261', '3992595', '3992594', '3992611', '3958631', '3958627', '3924467', '3895099', '3895093', '3992558', '3992549', '3992395', '3992512', '3965861', '3992393', '3979002', '3901425', '3979004', '3978878', '3892274', '3898348', '3991902', '3991903', '3991904', '3991905', '3955667', '3955670', '3991391', '3991388', '3991389', '3919878', '3919475', '3919477', '3969026', '3969027', '3969028', '3969078', '3969080', '3969081', '3991635', '3992600', '3933192', '3933196', '3933194', '3933190', '3979001', '3991906', '3896273', '3896275', '3896277', '3896272', '3896388', '3955038', '3934253', '3933257', '3919882', '3919879', '3909985', '3909986', '3909989', '3909990', '3909991', '3909984', '3911154', '3911155', '3911157', '3911158', '3911161', '3911162', '3931331', '3913668', '3913662', '3913664', '3986716', '3990728', '3957085', '3957083', '3971265', '3992536', '3992597', '3992598', '3992599', '3992596', '3943662', '3945068', '3991950', '3992589', '3959812', '3957713', '3957718', '3992565', '3992560', '3992562', '3992564', '3992561', '3992563', '3992559', '3943378', '3991907', '3991908', '3894007', '3902234', '3902231', '3917661', '3917665', '3991533', '3931542', '3991678', '3905196', '3992291', '3956832', '3956833', '3941421', '3941425', '3941618', '3945064', '3945063', '3955505', '3955506', '3955507', '3955508', '3933198', '3926057', '3926058', '3926059', '3926056', '3966854', '3966855', '3966857', '3965857', '3965860', '3966086', '3966084', '3966218', '3966088', '3992533', '3992535', '3992534', '3992614', '3992394', '3933801', '3903047', '3939377', '3939380', '3939373', '3939374', '3893434', '3991912', '3991910', '3991909', '3991911', '3992400', '3992399', '3992396', '3992398', '3992435', '3992441', '3992432', '3992440', '3992439', '3917080', '3917084', '3917081', '3967666', '3992572', '3992571', '3992574', '3992573', '3983545', '3983546', '3952152', '3952154', '3980082', '3958374', '3958376', '3956834', '3991951', '3991955', '3991953', '3991952', '3991954', '3924462', '3941937', '3991960', '3991962', '3991961', '3942293', '3966089', '3991966', '3992391', '3991636', '3904561', '3904562', '3904564', '3904567', '3904568', '3901429', '3894809', '3896276', '3926354', '3991679', '3991681', '3991683', '3991685', '3991686', '3991687', '3991688', '3991680', '3991682', '3991684', '3991689', '3991690', '3991691', '3955509', '3991956', '3991957', '3942494', '3942485', '3942492', '3991637', '3909572', '3892272', '3892278', '3892279', '3915053', '3900039', '3900040', '3900041', '3900042', '3911873', '3898345', '3898346', '3898347', '3898349', '3898350', '3898351', '3898353', '3898354', '3898355', '3991913', '3900233', '3900234', '3900231', '3900232', '3900634', '3899302', '3899304', '3899306', '3899308', '3899311', '3899312', '3899313', '3899314', '3899315', '3899316', '3899310', '3992401', '3991390', '3913244', '3913245', '3913246', '3913241', '3913242', '3913243', '3917082', '3917083', '3917085', '3934139', '3936758', '3916861', '3985477', '3957086', '3927593', '3927599', '3937624', '3937623', '3937622', '3937621', '3938140', '3938141', '3938142', '3938144', '3938145', '3947433', '3983548', '3983549', '3983550', '3983551', '3983552', '3983547', '3927300', '3927249', '3927327', '3900375', '3900376', '3900377', '3900379', '3900378', '3900635', '3900636', '3900637', '3900638', '3900639', '3899907', '3962333', '3929605', '3929606', '3929607', '3929608', '3929604', '3929609', '3929611', '3929610', '3929612', '3929613', '3929614', '3908532', '3908539', '3908540', '3908534', '3908535', '3908536', '3908541', '3908533', '3908531', '3908537', '3908538', '3910755', '3910754', '3910757', '3910758', '3910759', '3910760', '3910761', '3910762', '3910763', '3910764', '3910765', '3909060', '3909061', '3909057', '3909059', '3909051', '3909052', '3909053', '3909054', '3909055', '3909056', '3909058', '3929127', '3929128', '3929129', '3929131', '3929133', '3929135', '3929130', '3929132', '3929134', '3929136', '3929137', '3920477', '3920478', '3920479', '3920480', '3920481', '3920484', '3920485', '3920486', '3920482', '3920483', '3920487', '3898928', '3898922', '3898923', '3898925', '3898926', '3898927', '3898929', '3898918', '3898919', '3898920', '3898921', '3958375', '3956835', '3986188', '3986130', '3986532', '3921855', '3992551', '3992537', '3992513', '3930157', '3930163', '3930158', '3930159', '3930160', '3930161', '3930162', '3930164', '3930165', '3930166', '3930167', '3979118', '3979116', '3979117', '3957084', '3959813', '3991914', '3896389', '3955668', '3913667', '3937625', '3956836', '3947562', '3956837', '3948394', '3901030', '3901031', '3901034', '3894003', '3991918', '3991916', '3991915', '3991917', '3902232', '3906052', '3902858', '3914163', '3992443', '3992442', '3916148', '3986704', '3986703', '3967667', '3917662', '3931537', '3905197', '3905198', '3905199', '3905194', '3992271', '3991959', '3921850', '3992514', '3992541', '3965864', '3893070', '3903043', '3893436', '3902859', '3896390', '3951736', '3991964', '3991963', '3991967', '3933810', '3907453', '3911877', '3939372', '3991924', '3991925', '3991927', '3991935', '3991919', '3991921', '3991923', '3991920', '3991933', '3991936', '3991922', '3991926', '3991928', '3991934', '3991900', '3926358', '3992444', '3985473', '3947435', '3967142', '3967136', '3967137', '3941619', '3923328', '3925676', '3991958', '3927592', '3910753', '3904563', '3909574', '3892275', '3893664', '3991937', '3914158', '3967141', '3965265', '3896391', '3915046', '3902499', '3902497', '3991938', '3914162', '3920915', '3992592', '3917663', '3965263', '3906054', '3950722', '3991215', '3893665', '3893666', '3893069', '3893072', '3893075', '3893076', '3907452', '3907450', '3911869', '3893437', '3950725', '3950726', '3950727', '3950723', '3950724', '3918081', '3990729', '3922250', '3907455', '3916149', '3992556', '3945865', '3933804', '3893662', '3952155', '3952531', '3909569', '3928720', '3928715', '3928718', '3928719', '3992603', '3928396', '3928397', '3928401', '3962927', '3962928', '3962924', '3991205', '3992552', '3992554', '3985893', '3915048', '3992605', '3992604', '3991216', '3925129', '3991596', '3991584', '3917942', '3992610', '3992609', '3992608', '3991206', '3940722', '3925137', '3925139', '3917939', '3992606', '3992593', '3991594', '3925133', '3992607', '3982531', '3982532', '3982530', '3992010', '3991997', '3992601', '3991982', '3990916', '3982533', '3991857', '3850605', '3850611', '3850613', '3850606', '3850610', '3850607', '3850612', '3850609', '3850608', '3984254', '3991888', '3990932', '3963105', '3982529', '3991889', '3991890', '3958024', '3991850', '3911156', '3911160', '3913669', '3913666']
  269. Traceback (most recent call last):
  270.  
  271.   File "<ipython-input-9-77b4115b18b3>", line 266, in <module>
  272.     m.scrape_match_information()
  273.  
  274.   File "<ipython-input-9-77b4115b18b3>", line 191, in scrape_match_information
  275.     matches_number = self.matches_number(away_away_team)
  276.  
  277.   File "<ipython-input-9-77b4115b18b3>", line 80, in matches_number
  278.     return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
  279.  
  280. IndexError: list index out of range
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top