Advertisement
Guest User

Untitled

a guest
Dec 7th, 2019
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 21.71 KB | None | 0 0
  1.  
  2. import re
  3. import xlwt
  4. from bs4 import BeautifulSoup
  5. from selenium import webdriver
  6. from selenium.webdriver.common.action_chains import ActionChains
  7. import time
  8. from xlutils.copy import copy
  9.  
  10. from xlrd import open_workbook
  11.  
  12.  
  13. class MsportScraper:
  14. def __init__(self):
  15. self.driver = webdriver.Firefox()
  16.  
  17. def get_links_on_matches(self):
  18. links_list = []
  19. self.driver.get("http://live.7msport.com/pk_live_en.aspx?view=all&line=no")
  20. links_soup = BeautifulSoup(self.driver.page_source, 'lxml')
  21. for link_data in links_soup.find_all('td', class_='data'):
  22. analyse_js_link = link_data.find_all('a')[0]
  23. match_id = re.findall(r'\d+', str(analyse_js_link))[0]
  24. links_list.append(match_id)
  25. return links_list
  26.  
  27. def scroll_shim(self, object):
  28. passed_in_driver = self.driver
  29. x = object.location['x']
  30. y = object.location['y']
  31. scroll_by_coord = 'window.scrollTo(%s,%s);' % (
  32. x,
  33. y
  34. )
  35. scroll_nav_out_of_way = 'window.scrollBy(0, -120);'
  36. passed_in_driver.execute_script(scroll_by_coord)
  37. passed_in_driver.execute_script(scroll_nav_out_of_way)
  38.  
  39. @staticmethod
  40. def write_results_to_excel(results_list, row, workbook_path=None):
  41. workbook_created = False
  42. if workbook_path is None:
  43. timestamp = time.time()
  44. book = xlwt.Workbook(encoding="utf-8")
  45. book.add_sheet("Sheet 1")
  46. workbook_path = f"results_{timestamp}.xls"
  47. book.save(workbook_path)
  48. workbook_created = True
  49. rb = open_workbook(workbook_path, formatting_info=True)
  50. wb = copy(rb) # a writable copy (I can't read values out of this, only write to it)
  51. w_sheet = wb.get_sheet(0)
  52. for i in range(len(results_list)):
  53. w_sheet.write(row, i, results_list[i])
  54. wb.save(workbook_path)
  55. if workbook_created:
  56. return workbook_path
  57.  
  58. def find_head_to_heads(self, analyse_data):
  59. over_under = re.findall(r'Totally, (\d+) game\(s\) over, (\d+)', analyse_data)
  60. try:
  61. over_under_ratio = int(over_under[0][0]) / (int(over_under[0][0]) + int(over_under[0][1]))
  62. except ZeroDivisionError:
  63. over_under_ratio = 1.0
  64. except IndexError:
  65. over_under_ratio = None
  66. over_under2 = re.findall(r'(\d+) game\(s\) half-game over, (\d+)', analyse_data)
  67. try:
  68. over_under2_ratio = int(over_under2[0][0]) / (int(over_under2[0][0]) + int(over_under2[0][1]))
  69. except ZeroDivisionError:
  70. over_under2_ratio = 1.0
  71. except IndexError:
  72. over_under2_ratio = None
  73. head_to_heads = dict()
  74. head_to_heads['over_under0.75'] = over_under2_ratio
  75. head_to_heads['over_under2.5'] = over_under_ratio
  76. return head_to_heads
  77.  
  78. @staticmethod
  79. def matches_number(soup):
  80. match_inf = soup.find('div', class_='ana_count').get_text()
  81. return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
  82.  
  83. def scrape_match_information(self):
  84. match_ids = self.get_links_on_matches()
  85. soup_main_page = BeautifulSoup(self.driver.page_source, 'lxml')
  86. workbook_path = None
  87. row = 0
  88. print(match_ids)
  89. for match_id in match_ids:
  90. match = soup_main_page.find('tr', id=f'bh{match_id}')
  91. league = match.find('td', class_='match').get_text()
  92. match_time = match.find('td', class_='time').get_text()
  93. match_name = match.find('td', class_='home').get_text() + " vs " + match.find('td',
  94. class_='away').get_text()
  95. try:
  96. handicap = match.find('a', class_='rq').get_text()
  97. except AttributeError:
  98. handicap = None
  99. self.driver.get(f"http://analyse.7msport.com/{match_id}/index.shtml")
  100. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  101.  
  102. total_home_team = soup.find('div', id='divTeamHistoryA0')
  103.  
  104. try:
  105. a1_elem = self.driver.find_element_by_id('ddTabA1')
  106. ActionChains(self.driver).move_to_element(a1_elem).perform()
  107. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  108. home_home_team = soup.find('div', id='divTeamHistoryA1')
  109. except Exception:
  110. home_home_team = None
  111. b0_elem = self.driver.find_element_by_id('ddTabB0')
  112. self.scroll_shim(b0_elem)
  113. actions = ActionChains(self.driver).move_to_element(
  114. self.driver.find_element_by_id('divTeamHistoryB0'))
  115. actions.perform()
  116. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  117. total_away_team = soup.find('div', id='divTeamHistoryB0')
  118. try:
  119. b1_elem = self.driver.find_element_by_id('ddTabB1')
  120. ActionChains(self.driver).move_to_element(b1_elem).perform()
  121. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  122. away_away_team = soup.find('div', id='divTeamHistoryB1')
  123. except Exception:
  124. away_away_team = None
  125. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  126. home = soup.find(id='dtTeamHistoryTitleA').get_text().split(" - ")[0]
  127. away = soup.find(id='dtTeamHistoryTitleB').get_text().split(" - ")[0]
  128. time.sleep(4)
  129. if not home_home_team is None:
  130. total_matches = self.matches_number(home_home_team)
  131. home_ground_goal_ratio = 0
  132. home_ground_goal_conceded_ratio = 0
  133. for match_information in home_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
  134. count = 0
  135. for information in match_information.find_all('td'):
  136. count += 1
  137. if count == 4:
  138. home_ground_goal_ratio += int(information.get_text().split('-')[0])
  139. home_ground_goal_conceded_ratio += int(information.get_text().split('-')[1])
  140. home_ground_goal_ratio /= total_matches
  141. home_ground_goal_conceded_ratio /= total_matches
  142.  
  143. else:
  144. home_ground_goal_ratio = None
  145. home_ground_goal_conceded_ratio = None
  146. total_goal_conceded_ratio = 0
  147. total_goal_ratio = 0
  148. for match_information in total_home_team.find_all('tr', {'class': ['sjt1', 'sjt2']}):
  149. count = 0
  150. home_first_flag = False
  151.  
  152. for information in match_information.find_all('td'):
  153. count += 1
  154. if home in information.get_text():
  155. home_first_flag = True
  156. else:
  157. pass
  158. if count == 4 and home_first_flag:
  159. total_goal_ratio += int(information.get_text().split('-')[0])
  160. total_goal_conceded_ratio += int(information.get_text().split('-')[1])
  161. elif count == 4:
  162. total_goal_ratio += int(information.get_text().split('-')[1])
  163. total_goal_conceded_ratio += int(information.get_text().split('-')[0])
  164. total_matches = self.matches_number(total_home_team)
  165. total_goal_ratio /= total_matches
  166. total_goal_conceded_ratio /= total_matches
  167. away_total_goal_ratio = 0
  168. away_total_goal_conceded_ratio = 0
  169. for match_information in total_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
  170. away_first_flag = False
  171. count = 0
  172.  
  173. for information in match_information.find_all('td'):
  174. count += 1
  175. if away in information.get_text():
  176. away_first_flag = True
  177. else:
  178. pass
  179. if count == 4 and away_first_flag:
  180. away_total_goal_ratio += int(information.get_text().split('-')[0])
  181. away_total_goal_conceded_ratio += int(information.get_text().split('-')[1])
  182. elif count == 4:
  183. away_total_goal_conceded_ratio += int(information.get_text().split('-')[0])
  184.  
  185. away_total_goal_ratio += int(information.get_text().split('-')[1])
  186. total_matches = self.matches_number(total_away_team)
  187. away_total_goal_ratio /= total_matches
  188. away_total_goal_conceded_ratio /= total_matches
  189. if not away_away_team is None:
  190. away_ground_goal_conceded_ratio = 0
  191. away_ground_goal_ratio = 0
  192. matches_number = self.matches_number(away_away_team)
  193. for match_information in away_away_team.find_all('tr', {'class': ['sjt3', 'sjt4']}):
  194. count = 0
  195. for information in match_information.find_all('td'):
  196. count += 1
  197. if count == 4:
  198. away_ground_goal_ratio += int(information.get_text().split('-')[1])
  199. away_ground_goal_conceded_ratio += int(information.get_text().split('-')[0])
  200. away_ground_goal_ratio /= matches_number
  201. away_ground_goal_conceded_ratio /= matches_number
  202. else:
  203. away_ground_goal_ratio = None
  204. away_ground_goal_conceded_ratio = None
  205.  
  206. p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
  207. if p_confrontation == "":
  208. results = {'over_under2.5': None, "over_under0.75": None}
  209. results_home_ground = {'over_under2.5': None, "over_under0.75": None}
  210.  
  211. else:
  212. results = self.find_head_to_heads(p_confrontation)
  213. self.driver.find_element_by_id('WJchk').click()
  214. time.sleep(1)
  215. soup = BeautifulSoup(self.driver.page_source, 'lxml')
  216. p_confrontation = soup.find('div', id='jfwj_body').find('p', class_='ana_count').get_text()
  217. results_home_ground = self.find_head_to_heads(p_confrontation)
  218. p_confrontation = total_home_team.find('div', class_='ana_count').get_text()
  219. total_home_results = self.find_head_to_heads(p_confrontation)
  220.  
  221. if not home_home_team is None:
  222. p_confrontation = home_home_team.find('div', class_='ana_count').get_text()
  223. home_home_results = self.find_head_to_heads(p_confrontation)
  224. else:
  225. home_home_results = {'over_under2.5': None, "over_under0.75": None}
  226. p_confrontation = total_away_team.find('div', class_='ana_count').get_text()
  227. total_away_results = self.find_head_to_heads(p_confrontation)
  228. if not away_away_team is None:
  229. p_confrontation = away_away_team.find('div', class_='ana_count').get_text()
  230. away_away_results = self.find_head_to_heads(p_confrontation)
  231. else:
  232. away_away_results = {'over_under2.5': None, "over_under0.75": None}
  233. results_list = [league,
  234. match_time,
  235. match_name,
  236. handicap,
  237. total_goal_ratio,
  238. home_ground_goal_ratio,
  239. total_goal_conceded_ratio,
  240. home_ground_goal_conceded_ratio,
  241. away_total_goal_ratio,
  242. away_ground_goal_ratio,
  243. away_total_goal_conceded_ratio,
  244. away_ground_goal_conceded_ratio,
  245. results['over_under2.5'],
  246. results['over_under0.75'],
  247. results_home_ground['over_under2.5'],
  248. results_home_ground['over_under0.75'],
  249. total_home_results['over_under2.5'],
  250. total_home_results['over_under0.75'],
  251. home_home_results['over_under2.5'],
  252. home_home_results['over_under0.75'],
  253. total_home_results['over_under2.5'],
  254. total_home_results['over_under0.75'],
  255. total_away_results['over_under2.5'],
  256. total_away_results['over_under0.75'],
  257. away_away_results['over_under2.5'],
  258. away_away_results['over_under0.75']]
  259. if workbook_path is None:
  260. workbook_path = self.write_results_to_excel(results_list, row)
  261. else:
  262. self.write_results_to_excel(results_list, row, workbook_path)
  263. row += 1
  264.  
  265.  
  266. m = MsportScraper()
  267. m.scrape_match_information()
  268. ['3992506', '3992285', '3916866', '3991546', '3992538', '3992557', '3991858', '3963106', '3884508', '3991536', '3992510', '3966216', '3990988', '3992586', '3992527', '3924463', '3924464', '3924466', '3992542', '3943380', '3992509', '3992507', '3992519', '3992590', '3991891', '3991895', '3991892', '3992511', '3936743', '3934252', '3934251', '3933256', '3933260', '3931326', '3931327', '3932001', '3932003', '3932004', '3931881', '3931879', '3931880', '3931882', '3916863', '3992587', '3992588', '3962330', '3992526', '3992523', '3992524', '3992525', '3992529', '3992531', '3992528', '3992530', '3901641', '3924461', '3992543', '3992602', '3991897', '3991893', '3965862', '3966217', '3966087', '3966215', '3960158', '3960503', '3960504', '3960501', '3958996', '3992290', '3992518', '3966213', '3966214', '3894005', '3936742', '3931328', '3916867', '3927597', '3937626', '3931541', '3992522', '3992521', '3945065', '3986477', '3941939', '3941934', '3941936', '3941935', '3941938', '3932435', '3932436', '3932437', '3922638', '3992516', '3992540', '3966852', '3992532', '3984255', '3884507', '3936744', '3992591', '3992555', '3992585', '3992545', '3992547', '3992548', '3992544', '3992546', '3992550', '3992515', '3992517', '3991896', '3992539', '3991894', '3991851', '3909576', '3904921', '3904922', '3904919', '3926362', '3933000', '3934140', '3934141', '3934142', '3934138', '3939676', '3936755', '3919876', '3919881', '3942004', '3931330', '3932006', '3932005', '3932007', '3931883', '3931885', '3992286', '3899909', '3938254', '3938384', '3992520', '3901637', '3901639', '3901640', '3956831', '3940261', '3992595', '3992594', '3992611', '3958631', '3958627', '3924467', '3895099', '3895093', '3992558', '3992549', '3992395', '3992512', '3965861', '3992393', '3979002', '3901425', '3979004', '3978878', '3892274', '3898348', '3991902', '3991903', '3991904', '3991905', '3955667', '3955670', '3991391', '3991388', '3991389', '3919878', '3919475', '3919477', '3969026', '3969027', '3969028', '3969078', '3969080', '3969081', '3991635', '3992600', '3933192', '3933196', '3933194', '3933190', '3979001', '3991906', '3896273', '3896275', '3896277', '3896272', '3896388', '3955038', '3934253', '3933257', '3919882', '3919879', '3909985', '3909986', '3909989', '3909990', '3909991', '3909984', '3911154', '3911155', '3911157', '3911158', '3911161', '3911162', '3931331', '3913668', '3913662', '3913664', '3986716', '3990728', '3957085', '3957083', '3971265', '3992536', '3992597', '3992598', '3992599', '3992596', '3943662', '3945068', '3991950', '3992589', '3959812', '3957713', '3957718', '3992565', '3992560', '3992562', '3992564', '3992561', '3992563', '3992559', '3943378', '3991907', '3991908', '3894007', '3902234', '3902231', '3917661', '3917665', '3991533', '3931542', '3991678', '3905196', '3992291', '3956832', '3956833', '3941421', '3941425', '3941618', '3945064', '3945063', '3955505', '3955506', '3955507', '3955508', '3933198', '3926057', '3926058', '3926059', '3926056', '3966854', '3966855', '3966857', '3965857', '3965860', '3966086', '3966084', '3966218', '3966088', '3992533', '3992535', '3992534', '3992614', '3992394', '3933801', '3903047', '3939377', '3939380', '3939373', '3939374', '3893434', '3991912', '3991910', '3991909', '3991911', '3992400', '3992399', '3992396', '3992398', '3992435', '3992441', '3992432', '3992440', '3992439', '3917080', '3917084', '3917081', '3967666', '3992572', '3992571', '3992574', '3992573', '3983545', '3983546', '3952152', '3952154', '3980082', '3958374', '3958376', '3956834', '3991951', '3991955', '3991953', '3991952', '3991954', '3924462', '3941937', '3991960', '3991962', '3991961', '3942293', '3966089', '3991966', '3992391', '3991636', '3904561', '3904562', '3904564', '3904567', '3904568', '3901429', '3894809', '3896276', '3926354', '3991679', '3991681', '3991683', '3991685', '3991686', '3991687', '3991688', '3991680', '3991682', '3991684', '3991689', '3991690', '3991691', '3955509', '3991956', '3991957', '3942494', '3942485', '3942492', '3991637', '3909572', '3892272', '3892278', '3892279', '3915053', '3900039', '3900040', '3900041', '3900042', '3911873', '3898345', '3898346', '3898347', '3898349', '3898350', '3898351', '3898353', '3898354', '3898355', '3991913', '3900233', '3900234', '3900231', '3900232', '3900634', '3899302', '3899304', '3899306', '3899308', '3899311', '3899312', '3899313', '3899314', '3899315', '3899316', '3899310', '3992401', '3991390', '3913244', '3913245', '3913246', '3913241', '3913242', '3913243', '3917082', '3917083', '3917085', '3934139', '3936758', '3916861', '3985477', '3957086', '3927593', '3927599', '3937624', '3937623', '3937622', '3937621', '3938140', '3938141', '3938142', '3938144', '3938145', '3947433', '3983548', '3983549', '3983550', '3983551', '3983552', '3983547', '3927300', '3927249', '3927327', '3900375', '3900376', '3900377', '3900379', '3900378', '3900635', '3900636', '3900637', '3900638', '3900639', '3899907', '3962333', '3929605', '3929606', '3929607', '3929608', '3929604', '3929609', '3929611', '3929610', '3929612', '3929613', '3929614', '3908532', '3908539', '3908540', '3908534', '3908535', '3908536', '3908541', '3908533', '3908531', '3908537', '3908538', '3910755', '3910754', '3910757', '3910758', '3910759', '3910760', '3910761', '3910762', '3910763', '3910764', '3910765', '3909060', '3909061', '3909057', '3909059', '3909051', '3909052', '3909053', '3909054', '3909055', '3909056', '3909058', '3929127', '3929128', '3929129', '3929131', '3929133', '3929135', '3929130', '3929132', '3929134', '3929136', '3929137', '3920477', '3920478', '3920479', '3920480', '3920481', '3920484', '3920485', '3920486', '3920482', '3920483', '3920487', '3898928', '3898922', '3898923', '3898925', '3898926', '3898927', '3898929', '3898918', '3898919', '3898920', '3898921', '3958375', '3956835', '3986188', '3986130', '3986532', '3921855', '3992551', '3992537', '3992513', '3930157', '3930163', '3930158', '3930159', '3930160', '3930161', '3930162', '3930164', '3930165', '3930166', '3930167', '3979118', '3979116', '3979117', '3957084', '3959813', '3991914', '3896389', '3955668', '3913667', '3937625', '3956836', '3947562', '3956837', '3948394', '3901030', '3901031', '3901034', '3894003', '3991918', '3991916', '3991915', '3991917', '3902232', '3906052', '3902858', '3914163', '3992443', '3992442', '3916148', '3986704', '3986703', '3967667', '3917662', '3931537', '3905197', '3905198', '3905199', '3905194', '3992271', '3991959', '3921850', '3992514', '3992541', '3965864', '3893070', '3903043', '3893436', '3902859', '3896390', '3951736', '3991964', '3991963', '3991967', '3933810', '3907453', '3911877', '3939372', '3991924', '3991925', '3991927', '3991935', '3991919', '3991921', '3991923', '3991920', '3991933', '3991936', '3991922', '3991926', '3991928', '3991934', '3991900', '3926358', '3992444', '3985473', '3947435', '3967142', '3967136', '3967137', '3941619', '3923328', '3925676', '3991958', '3927592', '3910753', '3904563', '3909574', '3892275', '3893664', '3991937', '3914158', '3967141', '3965265', '3896391', '3915046', '3902499', '3902497', '3991938', '3914162', '3920915', '3992592', '3917663', '3965263', '3906054', '3950722', '3991215', '3893665', '3893666', '3893069', '3893072', '3893075', '3893076', '3907452', '3907450', '3911869', '3893437', '3950725', '3950726', '3950727', '3950723', '3950724', '3918081', '3990729', '3922250', '3907455', '3916149', '3992556', '3945865', '3933804', '3893662', '3952155', '3952531', '3909569', '3928720', '3928715', '3928718', '3928719', '3992603', '3928396', '3928397', '3928401', '3962927', '3962928', '3962924', '3991205', '3992552', '3992554', '3985893', '3915048', '3992605', '3992604', '3991216', '3925129', '3991596', '3991584', '3917942', '3992610', '3992609', '3992608', '3991206', '3940722', '3925137', '3925139', '3917939', '3992606', '3992593', '3991594', '3925133', '3992607', '3982531', '3982532', '3982530', '3992010', '3991997', '3992601', '3991982', '3990916', '3982533', '3991857', '3850605', '3850611', '3850613', '3850606', '3850610', '3850607', '3850612', '3850609', '3850608', '3984254', '3991888', '3990932', '3963105', '3982529', '3991889', '3991890', '3958024', '3991850', '3911156', '3911160', '3913669', '3913666']
  269. Traceback (most recent call last):
  270.  
  271. File "<ipython-input-9-77b4115b18b3>", line 266, in <module>
  272. m.scrape_match_information()
  273.  
  274. File "<ipython-input-9-77b4115b18b3>", line 191, in scrape_match_information
  275. matches_number = self.matches_number(away_away_team)
  276.  
  277. File "<ipython-input-9-77b4115b18b3>", line 80, in matches_number
  278. return int(re.findall(r'(\d+) match\(es\) in total', match_inf)[0])
  279.  
  280. IndexError: list index out of range
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement