import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Function to get the webpage content
def get_webpage_content(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status). Failures are
        reported on stdout rather than raised.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled server, which would hang the whole script.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text/* responses, which garbles UTF-8 pages. Use the encoding
    # detected from the body instead in that case.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text

# Function to read keywords from a file
def read_keywords_from_file(filename):
    """Load keywords from a text file containing one keyword per line.

    Each line is stripped of surrounding whitespace and lowercased; blank
    lines are skipped.

    Args:
        filename: Path of the keyword file.

    Returns:
        A list of lowercase keywords in file order, or an empty list when
        the file does not exist (a message is printed in that case).
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
        # mark. With plain 'utf-8' the BOM stays attached to the first
        # keyword (str.strip() does not remove it), so it would never match.
        with open(filename, 'r', encoding='utf-8-sig') as file:
            stripped_lines = (line.strip() for line in file)
            return [keyword.lower() for keyword in stripped_lines if keyword]
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return []

# Function to count keyword occurrences accurately using regular expressions
def count_keywords(content, keywords):
    """Count whole-word occurrences of each keyword in an HTML document.

    The HTML is reduced to its text, whitespace is collapsed, and matching
    is case-insensitive.

    Args:
        content: HTML markup of the page.
        keywords: Iterable of keyword strings (single words or phrases).

    Returns:
        A dict mapping each keyword, exactly as given, to the number of
        non-overlapping occurrences found. Blank keywords map to 0.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # Join text nodes with a space: with the default empty separator the
    # text of adjacent elements is fused ("<li>fire</li><li>risk</li>"
    # becomes "firerisk") and whole-word matches are lost. The trade-off is
    # that a word split by inline markup is now seen as two words.
    raw_text = soup.get_text(separator=' ')

    # Collapse every whitespace run (newlines, tabs, repeated spaces) to one
    # space so multi-word keywords match however the page was laid out.
    text = ' '.join(raw_text.split()).lower()

    keyword_count = {}
    for keyword in keywords:
        # Normalize the keyword the same way as the page text.
        needle = ' '.join(keyword.lower().split())
        if not needle:
            # An empty pattern would match at every boundary position.
            keyword_count[keyword] = 0
            continue

        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it never matches keywords that begin or end with punctuation
        # (e.g. "c++" or ".net"). For ordinary words the result is the same.
        pattern = rf'(?<!\w){re.escape(needle)}(?!\w)'
        keyword_count[keyword] = len(re.findall(pattern, text))

    return keyword_count

# Function to export the result to CSV
def export_to_csv(keyword_count, filename):
    """Write keyword tallies to *filename* as a two-column CSV.

    The file has a ``Keyword,Count`` header followed by one row per entry
    of *keyword_count*, in the mapping's iteration order. A confirmation
    message is printed once the file is written.

    Args:
        keyword_count: Mapping of keyword to occurrence count.
        filename: Destination path of the CSV file.
    """
    table = pd.DataFrame(
        {
            'Keyword': list(keyword_count.keys()),
            'Count': list(keyword_count.values()),
        }
    )
    table.to_csv(filename, index=False, encoding='utf-8')
    print(f"Keyword counts have been exported to {filename}")

# Main execution
def main(keywords_file='keywords.txt',
         url='https://alllandlordcertificates.co.uk/fire-risk-assessment',
         output_file='keyword_counts.csv'):
    """Count keyword occurrences on one web page and export them to CSV.

    Args:
        keywords_file: Text file with one keyword per line.
        url: Page to analyze. Replace the default with your desired URL.
        output_file: Path of the CSV file to write.

    Returns:
        None. Nothing is exported when there are no keywords or the page
        could not be retrieved; a message explains which.
    """
    # Read keywords first: with nothing to count there is no reason to make
    # a network request at all.
    keywords = read_keywords_from_file(keywords_file)
    if not keywords:
        print("No keywords to search for; nothing was exported.")
        return

    # Fetch the webpage content
    content = get_webpage_content(url)
    if not content:
        print("No page content to analyze; nothing was exported.")
        return

    # Count keywords and export the result to CSV
    keyword_count = count_keywords(content, keywords)
    export_to_csv(keyword_count, output_file)


if __name__ == "__main__":
    main()