# FastTrack Python scraper 1

import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
import openpyxl
# Scrape race-result pages listed in column D of the workbook and write one
# encoded string per race into columns F onwards of the same row, plus the
# page header text into column E.

# Load the existing Excel workbook or create a new one if it doesn't exist
file_path = "output21.xlsx"
try:
    wb = load_workbook(file_path)
except FileNotFoundError:
    wb = openpyxl.Workbook()
ws = wb.active

# Site root that the relative links found in the result tables are appended to.
BASE_URL = 'https://fasttrack.grv.org.au'

# User-Agent header to mimic a web browser
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
}

# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# Worksheet rows whose column D holds a URL to scrape. Only row 2 is processed
# at the moment; widen the range to cover more rows.
ROWS_TO_PROCESS = range(2, 3)

# Column index of 'F', where the first race of a row is written.
FIRST_RESULT_COLUMN = 6


def _labelled_field_text(fields, label_for, caption, tag, space_replacement):
    """Return the joined text of every field div holding a label for *label_for*.

    In each matching field's text, *caption* is replaced with *tag* and spaces
    are replaced with *space_replacement*. Returns "" when nothing matches.
    """
    parts = []
    for field in fields:
        if field.find('label', attrs={'for': label_for}) is not None:
            text = field.get_text(strip=True)
            text = text.replace(caption, tag)
            parts.append(text.replace(' ', space_replacement))
    return "".join(parts)


def _race_header(race_detail_div):
    """Build the "<race-number>^<race-time><type><distance>" prefix for one race."""
    race_numbers = []
    race_times = []
    for summary in race_detail_div.find_all('div', class_='left race-summary'):
        number_div = summary.find('div', class_='race-number')
        if number_div is not None:
            race_numbers.append(number_div.get_text(strip=True).replace(' ', ''))

        time_div = summary.find('div', class_='race-time')
        if time_div is not None:
            race_times.append(time_div.get_text(strip=True).replace(' ', '_'))

    fields = race_detail_div.find_all('div', class_='field')
    race_type = _labelled_field_text(
        fields, 'SelectedResultsForRace_RaceTypeName', 'Type', '#y:', '_')
    distance = _labelled_field_text(
        fields, 'SelectedResultsForRace_DistanceInMetres', 'Distance', '#d:', '')

    return f"{''.join(race_numbers)}^{''.join(race_times)}{race_type}{distance}"


def _fetch_fifth_display_value(profile_url):
    """Return the text of the 5th 'display-value' div on the linked page.

    The value is used as the runner's colour in the output (presumably the
    colour field of the profile page; verify against the site markup).
    Returns "" when the request fails, the status is not 200, the
    'mainContentArea' div is missing, or fewer than five values are present,
    so a missing value can never leak in from a previously processed runner.
    """
    try:
        profile_response = requests.get(
            profile_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Request failed for {profile_url}: {exc}")
        return ""
    if profile_response.status_code != 200:
        return ""

    # Parsed into its own variable so the race page's soup is never clobbered.
    profile_soup = BeautifulSoup(profile_response.content, 'html.parser')
    main_content_div = profile_soup.find('div', id='mainContentArea')
    if main_content_div is None:
        return ""

    display_values = main_content_div.find_all('div', class_='display-value')
    if len(display_values) < 5:
        return ""
    return display_values[4].get_text(strip=True)


def _format_result_row(tr):
    """Encode one result-table row, or return None if it has fewer than 11 cells.

    Rows that are too short are skipped before any network request is made,
    since they contribute nothing to the output.
    """
    tds = tr.find_all('td')
    if len(tds) < 11:
        return None

    td_texts = [td.get_text(strip=True) for td in tds]

    # The second cell links to the runner's own page; it may lack a link.
    link = tds[1].find('a', href=True)
    colour = ""
    if link is not None:
        colour = _fetch_fifth_display_value(BASE_URL + link['href'].strip())

    encoded = (
        f"#p:{td_texts[0]}#n:{td_texts[1]}@{colour}#a:{td_texts[2]}"
        f"#b:{td_texts[3]}#r:{td_texts[4]}#w:{td_texts[5]}#s:{td_texts[6]}"
        f"#q:{td_texts[7]}#t:{td_texts[8]}#m:{td_texts[9]}#o:{td_texts[10]}"
    )
    return encoded.replace(' ', '_')


def _race_results(table):
    """Return the concatenated encoding of every styled row in *table*."""
    encoded_rows = []
    for tr in table.find_all('tr', style=True):
        encoded = _format_result_row(tr)
        if encoded is not None:
            encoded_rows.append(encoded)
    return "".join(encoded_rows)


for row in ROWS_TO_PROCESS:
    url = ws[f'D{row}'].value
    if not url:
        continue

    print(f"Processing URL from row {row}: {url}")

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Skip this row but keep going so the workbook is still saved.
        print(f"Request failed for {url}: {exc}")
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with class "left race-results-header" and get its text
    header_div = soup.find('div', class_='left race-results-header')
    header_text = header_div.get_text(strip=True) if header_div else "Header not found"
    print(header_text)

    # Rows whose column F is already filled are left untouched.
    if ws[f'F{row}'].value is not None:
        continue

    race_detail_divs = soup.find_all('div', class_='race-detail-container')

    # Looked up once per page. A list passed to class_ matches tables carrying
    # either of the listed classes.
    tables = soup.find_all('table', class_=['raceResultsTable', 'dataTable'])

    column_index = FIRST_RESULT_COLUMN
    for meet_num, race_detail_div in enumerate(race_detail_divs):
        # Races and tables are paired by position; a race with no table at
        # its index gets the placeholder instead of raising IndexError.
        if meet_num < len(tables):
            combined_content = (
                f"{_race_header(race_detail_div)}{_race_results(tables[meet_num])}"
            )
        else:
            combined_content = "Content not found."

        ws.cell(row=row, column=column_index, value=combined_content)
        column_index += 1

    ws[f'E{row}'] = header_text

# Save the workbook
wb.save(file_path)