Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from openpyxl import load_workbook
- import openpyxl
# Load the existing Excel workbook, or create a fresh one if the file
# does not exist yet.  `ws` is the active worksheet in either case.
file_path = "output21.xlsx"
try:
    wb = load_workbook(file_path)
except FileNotFoundError:
    # `openpyxl` is already imported at module level — no need for a
    # second, local `from openpyxl import Workbook`.
    wb = openpyxl.Workbook()
ws = wb.active
# Walk the worksheet rows whose column D holds a race-results URL, scrape
# each results page, and write one cell per race (starting at column F).
# NOTE(review): range(2, 3) processes row 2 only, although the original
# comment said "rows 3 to 5" — widen the range once this run is verified.
for row in range(2, 3):
    url = ws[f'D{row}'].value
    if not url:
        continue

    print(f"Processing URL from row {row}: {url}")

    # Mimic a desktop browser so the site does not reject the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ''(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Page banner (meeting header); falls back to a placeholder string.
    header_div = soup.find('div', class_='left race-results-header')
    header_text = header_div.get_text(strip=True) if header_div else "Header not found"
    print(header_text)

    # Only scrape race details when column F is still empty, so rows that
    # were already processed on a previous run are skipped.
    if ws[f'F{row}'].value is None:
        race_detail_divs = soup.find_all('div', class_='race-detail-container')
        # Results are written starting at column F (index 6), one race each.
        column_index = 6
        # Look the results tables up ONCE from the listing page.  (The old
        # code re-ran this inside the loop after `soup` had been clobbered
        # by a per-dog detail page, breaking every race after the first.)
        tables = soup.find_all('table', class_=['raceResultsTable', 'dataTable'])

        for meet_num, race_detail_div in enumerate(race_detail_divs):
            # --- Race summary: meeting number and start time -------------
            selected_meeting = []
            selected_time = []
            for summary in race_detail_div.find_all('div', class_='left race-summary'):
                number_div = summary.find('div', class_='race-number')
                if number_div:
                    selected_meeting.append(number_div.get_text(strip=True).replace(' ', ''))
                time_div = summary.find('div', class_='race-time')
                if time_div:
                    selected_time.append(time_div.get_text(strip=True).replace(' ', '_'))

            # --- Race type and distance fields ---------------------------
            fields = race_detail_div.find_all('div', class_='field')
            selected_fields = []
            selected_fields2 = []
            for field in fields:
                if field.find('label', attrs={'for': 'SelectedResultsForRace_RaceTypeName'}):
                    field_text = field.get_text(strip=True).replace('Type', '#y:').replace(' ', '_')
                    selected_fields.append(field_text)
                if field.find('label', attrs={'for': 'SelectedResultsForRace_DistanceInMetres'}):
                    field_text = field.get_text(strip=True).replace('Distance', '#d:').replace(' ', '')
                    selected_fields2.append(field_text)

            # Compact header token, e.g. "R1^7:15pm#y:...#d:..."
            combined_header_content = (
                f"{''.join(selected_meeting)}^{''.join(selected_time)}"
                f"{''.join(selected_fields)}{''.join(selected_fields2)}"
            )

            # --- Per-runner rows from the matching results table ---------
            result_content = []
            if tables:
                table = tables[meet_num]
                # Only <tr> elements carrying a style attribute hold results.
                for tr in table.find_all('tr', style=True):
                    tds = tr.find_all('td')
                    td_texts = [td.get_text(strip=True) for td in tds]
                    td_hrefs = [td.find('a', href=True)['href'].strip()
                                if td.find('a', href=True) else None
                                for td in tds]

                    # Second cell links to the dog's detail page.
                    dog_url = 'https://fasttrack.grv.org.au' + td_hrefs[1]
                    detail_response = requests.get(dog_url, headers=headers)

                    # Default to empty so the row is still written when the
                    # detail request fails (old code left this undefined).
                    colour_div_text = ""
                    if detail_response.status_code == 200:
                        # Parse into a SEPARATE soup so the listing-page
                        # `soup`/`tables` above are not overwritten.
                        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                        main_content_div = detail_soup.find('div', id='mainContentArea')
                        if main_content_div:
                            divs = main_content_div.find_all('div', class_='display-value')
                            # 5th display-value holds the dog's colour —
                            # presumably; TODO confirm against the live page.
                            if len(divs) >= 5:
                                colour_div_text = divs[4].get_text(strip=True)

                    # Join the row cells into one "#key:value" token string.
                    if len(td_texts) >= 11:
                        concatenated_content = (
                            f"#p:{td_texts[0]}#n:{td_texts[1]}@{colour_div_text}"
                            f"#a:{td_texts[2]}#b:{td_texts[3]}#r:{td_texts[4]}"
                            f"#w:{td_texts[5]}#s:{td_texts[6]}#q:{td_texts[7]}"
                            f"#t:{td_texts[8]}#m:{td_texts[9]}#o:{td_texts[10]}"
                        ).replace(' ', '_')
                        result_content.append(concatenated_content)

                combined_content = f"{combined_header_content}{''.join(result_content)}"
            else:
                combined_content = "Content not found."

            ws.cell(row=row, column=column_index, value=combined_content)
            column_index += 1

    # Record the page banner in column E for this row.
    ws[f'E{row}'] = header_text

# Persist everything after all rows have been processed.
wb.save(file_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement