"""Count how often each keyword from a text file appears on a web page.

The script reads keywords (one per line) from ``KEYWORDS_FILE``, downloads
``TARGET_URL``, counts whole-word, case-insensitive occurrences of every
keyword in the page's text, and writes the counts to ``OUTPUT_FILE`` as CSV.
"""

import re
from typing import Dict, Iterable, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Defaults used when the module is run as a script.
KEYWORDS_FILE = 'keywords.txt'
TARGET_URL = 'https://alllandlordcertificates.co.uk/fire-risk-assessment'  # Replace with your desired URL
OUTPUT_FILE = 'keyword_counts.csv'
REQUEST_TIMEOUT = 10  # seconds


def get_webpage_content(url: str, timeout: float = REQUEST_TIMEOUT) -> Optional[str]:
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait forever on an unresponsive host.

    Returns:
        The response text, or ``None`` if the request failed (connection
        error, timeout, or an HTTP error status). Failures are reported on
        stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None
    return response.text


def read_keywords_from_file(filename: str) -> List[str]:
    """Read keywords from *filename*, one keyword per line.

    Each line is stripped of surrounding whitespace and lower-cased; blank
    lines are skipped.

    Args:
        filename: Path of the keyword file.

    Returns:
        The keywords in file order, or an empty list if the file does not
        exist (a message is printed in that case).
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading
        # byte-order mark that would otherwise stick to the first keyword
        # and stop it from ever matching.
        with open(filename, 'r', encoding='utf-8-sig') as file:
            stripped_lines = (line.strip() for line in file)
            return [line.lower() for line in stripped_lines if line]
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return []


def _keyword_pattern(keyword: str) -> Optional[str]:
    """Build a whole-word regex for *keyword*, or ``None`` if it is blank."""
    words = keyword.lower().split()
    if not words:
        return None
    # Any run of whitespace in the page (newline, non-breaking space, ...)
    # may separate the words of a multi-word keyword.
    phrase = r'\s+'.join(re.escape(word) for word in words)
    # Lookarounds instead of \b: \b needs a word character on one side, so
    # it can never match around keywords such as "c++" or ".net".
    return rf'(?<!\w){phrase}(?!\w)'


def count_keywords(content: str, keywords: Iterable[str]) -> Dict[str, int]:
    """Count whole-word occurrences of each keyword in an HTML document.

    The markup is stripped with BeautifulSoup and matching is
    case-insensitive. A keyword only matches when it is not directly
    preceded or followed by a word character, so ``safe`` does not match
    inside ``safety``.

    Args:
        content: HTML source of the page.
        keywords: Keywords or phrases to look for.

    Returns:
        A dict mapping each keyword (exactly as passed in) to its number of
        occurrences. Blank keywords are reported with a count of 0.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # A separator keeps text from adjacent elements apart; without it
    # "<li>fire</li><li>risk</li>" collapses to "firerisk" and neither word
    # would be found as a whole word.
    text = soup.get_text(separator=' ').lower()

    keyword_count = {}
    for keyword in keywords:
        pattern = _keyword_pattern(keyword)
        keyword_count[keyword] = len(re.findall(pattern, text)) if pattern else 0
    return keyword_count


def export_to_csv(keyword_count: Dict[str, int], filename: str) -> None:
    """Write the keyword counts to *filename* as a two-column CSV.

    Args:
        keyword_count: Mapping of keyword to occurrence count.
        filename: Destination path; the file gets a ``Keyword,Count`` header
            and one row per keyword, encoded as UTF-8.
    """
    df = pd.DataFrame(list(keyword_count.items()), columns=['Keyword', 'Count'])
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Keyword counts have been exported to {filename}")


def main() -> None:
    """Run the read keywords -> fetch page -> count -> export pipeline."""
    keywords = read_keywords_from_file(KEYWORDS_FILE)
    if not keywords:
        # Nothing to look for, so do not bother the remote server.
        print("No keywords to count; nothing exported.")
        return

    content = get_webpage_content(TARGET_URL)
    if not content:
        return

    export_to_csv(count_keywords(content, keywords), OUTPUT_FILE)


if __name__ == "__main__":
    main()