Advertisement
halsey845

Fasttrackpythonscrapper1

Jul 29th, 2024
29
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.42 KB | Software | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from openpyxl import load_workbook
  4. import openpyxl
  5. # Load the existing Excel workbook or create a new one if it doesn't exist
  6. file_path = "output21.xlsx"
  7. try:
  8. wb = load_workbook(file_path)
  9. ws = wb.active
  10. except FileNotFoundError:
  11. from openpyxl import Workbook
  12. wb = Workbook()
  13. ws = wb.active
  14.  
  15. # Iterate over rows 3 to 5 in column D to get URLs
  16. for row in range(2, 3):
  17. url = ws[f'D{row}'].value
  18. if url:
  19. # Print the URL
  20. print(f"Processing URL from row {row}: {url}")
  21.  
  22. # Define a User-Agent header to mimic a web browser
  23. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ''(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
  24.  
  25. # Send a GET request to the URL with headers
  26. response = requests.get(url, headers=headers)
  27.  
  28. # Parse the HTML content
  29. soup = BeautifulSoup(response.content, 'html.parser')
  30.  
  31. # Find the div with class "left race-results-header" and get its text
  32. header_div = soup.find('div', class_='left race-results-header')
  33. header_text = header_div.get_text(strip=True) if header_div else "Header not found"
  34.  
  35. print(header_text)
  36.  
  37. if ws[f'F{row}'].value is None:
  38. race_detail_divs = soup.find_all('div', class_='race-detail-container')
  39.  
  40. # Find all divs with class "race-detail-container"
  41. race_detail_divs = soup.find_all('div', class_='race-detail-container')
  42.  
  43. # Initialize column index starting from 'F'
  44. column_index = 6
  45.  
  46. for Meet_Num, race_detail_div in enumerate(race_detail_divs):
  47. # Find all divs with class "left race-summary"
  48. left_header_texts = race_detail_div.find_all('div', class_='left race-summary')
  49.  
  50. # Initialize lists to store selected meeting and time
  51. selected_meeting = []
  52. selected_time = []
  53.  
  54. # Iterate through left_header_texts to extract race-number and race-time
  55. for left_header_text in left_header_texts:
  56. # Extract race-number from div with class "race-number"
  57. header_left = left_header_text.find('div', class_='race-number')
  58. if header_left:
  59. meeting_text = header_left.get_text(strip=True)
  60. meeting_text = meeting_text.replace(' ', '')
  61. selected_meeting.append(meeting_text)
  62.  
  63. # Extract race-time from div with class "race-time"
  64. header_left2 = left_header_text.find('div', class_='race-time')
  65. if header_left2:
  66. time_text = header_left2.get_text(strip=True)
  67. time_text = time_text.replace(' ', '_')
  68. selected_time.append(time_text)
  69.  
  70. # Find all divs with class "field" containing <label for="SelectedResultsForRace_RaceTypeName">
  71. fields = race_detail_div.find_all('div', class_='field')
  72. selected_fields = []
  73. selected_fields2 = []
  74.  
  75. # Extract text from each matching field div
  76. for field in fields:
  77. label = field.find('label', attrs={'for': 'SelectedResultsForRace_RaceTypeName'})
  78. if label:
  79. field_text = field.get_text(strip=True)
  80. field_text = field_text.replace('Type', '#y:')
  81. field_text = field_text.replace(' ', '_')
  82. selected_fields.append(field_text)
  83. for field in fields:
  84. label = field.find('label', attrs={'for': 'SelectedResultsForRace_DistanceInMetres'})
  85. if label:
  86. field_text = field.get_text(strip=True)
  87. field_text = field_text.replace('Distance', '#d:')
  88. field_text = field_text.replace(' ', '')
  89. selected_fields2.append(field_text)
  90.  
  91. # Concatenate all selected field texts with a space separator
  92. left_header_combined_a = "".join(selected_meeting)
  93. left_header_combined_b = "".join(selected_time)
  94. field_texts_combined_a = "".join(selected_fields)
  95. field_texts_combined_b = "".join(selected_fields2)
  96.  
  97. header_texts_combined = f"{left_header_combined_a}^{left_header_combined_b}{field_texts_combined_a}{field_texts_combined_b}"
  98.  
  99. # Combine all contents
  100. combined_header_content = f"{header_texts_combined}"
  101.  
  102. # Initialize a list to store concatenated text contents for raceResultsTable dataTable
  103. result_content = []
  104.  
  105. # Find the table with class "raceResultsTable dataTable"
  106. tables = soup.find_all('table', class_=['raceResultsTable', 'dataTable'])
  107.  
  108. # Extract text content from each tr with style attribute within the table
  109. if tables:
  110.  
  111. table = tables[Meet_Num]
  112.  
  113. # Find all tr elements with style attribute within the table
  114. trs = table.find_all('tr', style=True)
  115.  
  116. for tr in table.find_all('tr', style=True):
  117. # Find all td elements within the tr element
  118. tds = tr.find_all('td')
  119.  
  120. # Extract text content from each td element and strip any leading/trailing whitespace
  121. td_texts = [td.get_text(strip=True) for td in tds]
  122.  
  123. # Extract href attribute from any a tags within each td element
  124. td_hrefs = [td.find('a', href=True)['href'].strip() if td.find('a', href=True) else None for td in tds]
  125.  
  126. td_URLs = 'https://fasttrack.grv.org.au'+td_hrefs[1]
  127.  
  128. response = requests.get(td_URLs,headers=headers)
  129.  
  130. # Check if the request was successful
  131. if response.status_code == 200:
  132. # Parse the HTML content of the page
  133. soup = BeautifulSoup(response.content, 'html.parser')
  134.  
  135. # Find the parent div with the specified id
  136. main_content_div = soup.find('div', id='mainContentArea')
  137.  
  138. if main_content_div:
  139. # Find all divs with the class 'display-value' within the mainContentArea div
  140. divs = main_content_div.find_all('div', class_='display-value')
  141.  
  142. # Loop through each div and print the text content
  143. for div in divs:
  144.  
  145. if len(divs) >= 5:
  146. # Get the 5th element (index 4)
  147. fifth_div = divs[4]
  148. # Extract the text content and strip any leading/trailing whitespace
  149. colour_div_text = fifth_div.get_text(strip=True)
  150.  
  151. # Concatenate the text contents of all td elements in the tr with "#" and "$" separators
  152. if len(td_texts) >= 11:
  153. concatenated_content = f"#p:{td_texts[0]}#n:{td_texts[1]}@{colour_div_text}#a:{td_texts[2]}#b:{td_texts[3]}#r:{td_texts[4]}#w:{td_texts[5]}#s:{td_texts[6]}#q:{td_texts[7]}#t:{td_texts[8]}#m:{td_texts[9]}#o:{td_texts[10]}"
  154. concatenated_content = concatenated_content.replace(' ', '_')
  155. result_content.append(concatenated_content)
  156. concatenated_content_result = "".join(result_content)
  157.  
  158. # Combine details
  159. combined_content = f"{combined_header_content}{concatenated_content_result}"
  160. else:
  161. combined_content = f"Content not found."
  162.  
  163. ws.cell(row=row, column=column_index, value=combined_content)
  164.  
  165. column_index += 1
  166.  
  167. ws[f'E{row}'] = header_text
  168.  
  169. # Save the workbook
  170. wb.save(file_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement