WareHouseHD
BART summarize long html documents
Oct 8th, 2024

from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import re
import html
import unicodedata
from newspaper import Article
from readability import Document
from bs4 import BeautifulSoup
import requests
import math


# Check if a GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
# Number of tokens to overlap between consecutive chunks
overlap = 50
# Batch size for summarization (number of chunks to summarize at once)
batch_size = 25
# Maximum input length (in tokens) for the model; updated in init_bart()
model_max_length = 512
# Model tokenizer
tokenizer = None
# Model instance, loaded from the selected model name
model = None
# Model name
model_name = None


# Defines the supported summary lengths
class SummaryLength:
    SHORT = "short"
    MEDIUM = "medium"
    LONG = "long"


# Write content to a file, overwriting it if it already exists
def overwrite_file(file_path: str, new_content: str):
    """Overwrite the file with the new content.

    Args:
        file_path: The path to the file to overwrite.
        new_content: The new content to write to the file.
    """
    # Make sure the new content is not empty
    if not new_content:
        print("Empty content supplied!")
        return

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(new_content)


# Clean text of unnecessary characters and escape sequences
def clean_text(text_content: str):
    """Clean the text content of unnecessary characters and escape sequences, replacing them with their plain-text representation.

    Args:
        text_content: The text content to clean.

    Returns:
        The cleaned text content.
    """
    # Convert HTML-escaped characters to their normal text representation
    cleaned_text = html.unescape(text_content)

    # Normalize Unicode characters (replace special characters)
    cleaned_text = unicodedata.normalize('NFKC', cleaned_text)

    # Clean up remaining whitespace characters (non-breaking spaces, zero-width spaces, etc.)
    cleaned_text = cleaned_text.replace('\xa0', ' ')    # Non-breaking space (U+00A0) to normal space
    cleaned_text = cleaned_text.replace('\u200b', ' ')  # Zero-width space
    cleaned_text = cleaned_text.replace('\u200c', ' ')  # Zero-width non-joiner
    cleaned_text = cleaned_text.replace('\u200d', ' ')  # Zero-width joiner
    cleaned_text = cleaned_text.replace('\uFEFF', ' ')  # Zero-width no-break space
    cleaned_text = cleaned_text.replace('\u3000', ' ')  # Ideographic space to normal space
    cleaned_text = cleaned_text.replace('©', '(c)')   # © to "(c)"
    cleaned_text = cleaned_text.replace('®', '(R)')   # ® to "(R)"
    cleaned_text = cleaned_text.replace('™', '(TM)')  # ™ to "(TM)"
    # Handle smart quotes and other special punctuation
    cleaned_text = cleaned_text.replace('‘', "'").replace('’', "'")  # Curly single quotes to straight
    cleaned_text = cleaned_text.replace('“', '"').replace('”', '"')  # Curly double quotes to straight
    cleaned_text = cleaned_text.replace('–', '-').replace('—', '-')  # En dash and em dash to hyphen
    cleaned_text = cleaned_text.replace('…', '...')  # Ellipsis to three dots

    # Replace runs of tabs with a single space
    cleaned_text = re.sub(r'\t+', ' ', cleaned_text)

    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)

    # Lines containing only whitespace become empty lines
    cleaned_text = re.sub(r'^\s+$', '', cleaned_text, flags=re.MULTILINE)

    # Normalize carriage returns (\r\n and bare \r) to newlines (\n)
    cleaned_text = re.sub(r'\r\n?', '\n', cleaned_text)

    # Replace runs of three or more newlines with a single newline
    cleaned_text = re.sub(r'\n{3,}', '\n', cleaned_text)

    return cleaned_text.strip()

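# A quick illustration of what clean_text normalizes (hypothetical input, kept in
# a comment so nothing runs at import time):
#   clean_text('“Smart quotes”\xa0&amp; ellipsis…\r\n\r\n\r\nDone')
#   -> '"Smart quotes" & ellipsis...\nDone'
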

# Extract the main content from an HTML page
def extract_main_html_content(html_content: str):
    """Extract the main content from the HTML page: the readable part, without the menus, header, footer, ads, etc.

    Args:
        html_content: The HTML page content to extract the main content from.

    Returns:
        The main HTML content extracted from the HTML page content.
    """
    # If the HTML content is empty, return an empty string
    if not html_content:
        print("Empty HTML content supplied!")
        return ""

    try:
        # Try newspaper first; create an Article object (no URL is needed here since we are using raw HTML)
        article = Article(url="")

        # Set the article's HTML content
        article.set_html(html_content)

        # Parse the article (this step is necessary to extract information)
        article.parse()
        newspaper_text_main_content = article.text
    except Exception as err:
        print(f"Error parsing article: {err}")
        newspaper_text_main_content = None

    if newspaper_text_main_content:
        html_main_content = newspaper_text_main_content.strip()
        print("Newspaper3k worked!")
    else:
        try:
            # If newspaper fails, try the Python port of Ruby's Readability
            doc = Document(html_content)
            readability_html_main_content = doc.summary(html_partial=True)
        except Exception as err:
            print(f"Error parsing article with readability: {err}")
            readability_html_main_content = None

        if readability_html_main_content:
            html_main_content = readability_html_main_content.strip()
            print("Readability worked!")
        else:
            # If both approaches fail, there is nothing left to try
            print("Error parsing article with both approaches. Exiting.")
            exit(1)

    # Clean the extracted content
    cleaned_html_main_content = clean_text(html_main_content)

    return cleaned_html_main_content


# Convert HTML content to plain text
def html_to_text(html_content: str):
    """Convert HTML content to text content.

    Args:
        html_content: The HTML content to convert to text content.

    Returns:
        The text content extracted from the HTML content.
    """
    # If the HTML content is empty, return an empty string
    if not html_content:
        print("Empty HTML content supplied!")
        return ""

    # Extract the text content from the HTML using BeautifulSoup with the lxml parser
    soup = BeautifulSoup(html_content, "lxml")
    text_content = soup.get_text().strip()
    # Clean the text content of unnecessary characters and escape sequences
    cleaned_text_content = clean_text(text_content)

    return cleaned_text_content


def get_url_html(url):
    """Get the HTML content of the page at the given URL.

    Args:
        url: The URL of the page to fetch.

    Returns:
        The HTML content of the page as a string, or None if the request fails.
    """
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        # A timeout keeps the request from hanging indefinitely on an unresponsive host
        response = session.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an error for bad responses
        return response.text
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return None

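# Usage sketch (hypothetical URL); callers must handle the None returned on failure:
#   page_html = get_url_html("https://example.com/article")
#   if page_html:
#       ...
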

# Initialize the BART model and tokenizer
def init_bart():
    """Initialize the BART model and tokenizer."""
    global model, tokenizer, device, model_max_length, model_name
    # Load the BART model and tokenizer
    model_name = 'facebook/bart-large-cnn'
    model_max_length = 1024
    tokenizer = BartTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
    model = BartForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)

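# Note: attn_implementation="flash_attention_2" requires the flash-attn package
# and a CUDA GPU, and float16 weights are likewise GPU-oriented. A minimal
# portable fallback (an assumption, not part of the original script) would be:
#   model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
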

# Calculate the minimum and maximum length of the output summary
def calc_min_max_length(original_num_tokens: int, summary_length: str):
    """
    Calculate the minimum and maximum length of the output summary based on the original text length.

    Args:
        original_num_tokens: The number of tokens in the original text.
        summary_length: Desired length of the summary ('short', 'medium', 'long').

    Returns:
        The minimum and maximum length of the output summary.
    """
    if summary_length == SummaryLength.SHORT:
        min_length = min(40, original_num_tokens * 0.1)
        max_length = min(150, original_num_tokens * 0.15)
    elif summary_length == SummaryLength.MEDIUM:
        min_length = min(150, original_num_tokens * 0.2)
        max_length = min(250, original_num_tokens * 0.35)
    else:  # SummaryLength.LONG (fall through so min_length/max_length are always bound)
        min_length = min(250, original_num_tokens * 0.4)
        max_length = min(450, original_num_tokens * 0.5)

    # Make sure min_length and max_length are integers
    min_length = int(min_length)
    max_length = int(max_length)

    # Ensure that the minimum length is at least 30
    min_length = max(30, min_length)

    # Ensure that the maximum length is at least the minimum length
    max_length = max(min_length, max_length)

    return min_length, max_length

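# Worked example of the length formulas above, for summary_length='medium':
#   2000-token article: min = min(150, 2000 * 0.2) = 150, max = min(250, 2000 * 0.35) = 250
#   200-token page:     min = min(150, 200 * 0.2)  = 40,  max = min(250, 200 * 0.35)  = 70
# (the final clamps then enforce min >= 30 and max >= min)
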

# Summarize the text content in chunks
def summarize_text(cleaned_text_content: str, summary_length: str, chunk_size: int):
    """
    Summarizes the cleaned text content using the BART model.

    Args:
        cleaned_text_content: The text content to summarize.
        summary_length: Desired length of the summary ('short', 'medium', 'long').
        chunk_size: The number of tokens in each chunk (including overlap) for summarization; must be less than or equal to the model's max length.

    Returns:
        The summarized text content.
    """
    global model_max_length, overlap, batch_size, tokenizer

    # Tokenize the input text
    tokens = tokenizer.encode(cleaned_text_content, return_tensors='pt')

    # Count the token length of the text
    original_num_tokens = tokens.shape[1]
    print(f'Original tokens length: {original_num_tokens}')

    # Calculate the min & max summary length based on the TOTAL text length
    output_min_length, output_max_length = calc_min_max_length(original_num_tokens, summary_length)

    # Directly summarize if the content is small enough for a single pass
    if original_num_tokens <= model_max_length:
        print('Summarizing single chunk...')
        combined_summary = summarize_single_chunk(tokens=tokens, output_min_length=output_min_length, output_max_length=output_max_length)
        combined_summary = combined_summary[0]
    else:
        # Initialize the summary variable
        final_summary = []

        # Chunks advance by 'stride' tokens so that consecutive chunks share 'overlap'
        # tokens, giving the model context across chunk boundaries
        stride = chunk_size - overlap

        # Calculate the total number of chunks and batches needed to summarize the full text
        num_chunks = max(1, math.ceil((original_num_tokens - overlap) / stride))
        num_batches = math.ceil(num_chunks / batch_size)
        print(f'Number of chunks: {num_chunks}, Number of batches: {num_batches}')

        # Prepare the chunks: the original text is split into chunks of chunk_size tokens,
        # each starting 'stride' tokens after the previous one
        chunks = []
        for i in range(num_chunks):
            start_index = i * stride
            end_index = min(start_index + chunk_size, original_num_tokens)
            chunk = tokens[:, start_index:end_index]
            # If the chunk is shorter than the model max length, pad it with the tokenizer's
            # pad token (padding with zeros would insert BART's <s> token instead)
            if chunk.shape[1] < model_max_length:
                padding_length = model_max_length - chunk.shape[1]
                padding = torch.full((1, padding_length), tokenizer.pad_token_id, dtype=torch.long)
                chunk = torch.cat([chunk, padding], dim=1)
            chunks.append(chunk)

        # Calculate the min & max summary length based on the CHUNK content length
        chunk_output_min_length, chunk_output_max_length = calc_min_max_length(chunk_size, summary_length)

        # Summarize each batch of chunks and collect the chunk summaries
        for i in range(0, len(chunks), batch_size):
            # Get the chunks for the current batch
            batch_chunks = chunks[i:i + batch_size]
            # Combine the batch of chunks into a single tensor
            batched_input = torch.cat(batch_chunks, dim=0)
            summarized_chunks = summarize_single_chunk(tokens=batched_input, output_min_length=chunk_output_min_length, output_max_length=chunk_output_max_length)
            # Append the summarized chunks to the final summary
            final_summary.extend(summarized_chunks)

        # Combine all summarized chunks into a single text
        combined_summary = ' '.join(final_summary)
        print(f'Summary tokens length: {len(tokenizer.tokenize(combined_summary))}')

    return combined_summary

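# Note: with many chunks, the concatenated chunk summaries can themselves exceed
# model_max_length. A common refinement (an extension, not part of the original
# script) is a second, recursive pass:
#   while len(tokenizer.tokenize(combined_summary)) > model_max_length:
#       combined_summary = summarize_text(combined_summary, summary_length, chunk_size)
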

# Summarize a single chunk (or batch of chunks) of text
def summarize_single_chunk(tokens: torch.Tensor, output_min_length: int, output_max_length: int):
    """
    Summarizes a single chunk of text tokens.

    Args:
        tokens: The tokenized input text as a PyTorch tensor.
        output_min_length: Minimum length of the output summary.
        output_max_length: Maximum length of the output summary.

    Returns:
        The summarized texts as a list of strings (one per batch item).
    """
    global model, tokenizer, device, model_max_length

    # Move the input tokens to the target device
    inputs = tokens.to(device)

    # Mask out the padding tokens so the model ignores them
    attention_mask = (inputs != tokenizer.pad_token_id).long()

    # Generate the summary using the model and torch autocast (automatic mixed precision)
    with torch.no_grad():
        with torch.autocast(device_type=device, dtype=torch.float16, enabled=True):
            summary_ids = model.generate(
                inputs,
                attention_mask=attention_mask,
                max_length=output_max_length,  # The maximum length of the output summary
                min_length=output_min_length,  # The minimum length of the output summary
                num_beams=4,  # Beam search width: the number of candidate sequences kept at each step
                early_stopping=True,  # Stop the beam search once num_beams finished candidates exist per batch item
                length_penalty=1.2,  # Values > 1.0 favor longer summaries in beam scoring
                repetition_penalty=2.0,  # Discourage repetition
                no_repeat_ngram_size=4,  # Never repeat the same 4-gram
            )

    # Decode the summaries
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    return summary


# Main entry point: summarize the text content of a URL
if __name__ == '__main__':
    url = "https://www.atltranslate.com/ai/blog/text-summarization-tips"

    try:
        print(f"Fetching content from: {url}")
        html_content = get_url_html(url)
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        html_content = None

    # Check whether the HTML content is None or empty
    if html_content:
        # Save the fetched content to a file
        overwrite_file('test_downloaded.html', html_content)

        # Extract the main text content from the HTML content
        cleaned_html_content = extract_main_html_content(html_content)
        cleaned_text_content = html_to_text(cleaned_html_content)

        # Save the cleaned text content to a new file
        overwrite_file('test_cleaned.txt', cleaned_text_content)

        # Check if there is any content to summarize
        if cleaned_text_content:
            init_bart()
            summary = summarize_text(cleaned_text_content, summary_length=SummaryLength.MEDIUM, chunk_size=model_max_length)
            summary = clean_text(summary)

            # Save the summary to a new file
            overwrite_file('test_summary.txt', summary)
            print("Summary saved to 'test_summary.txt'")
        else:
            print("No content remaining after extraction; nothing to summarize.")

Comments

WareHouseHD:

beautifulsoup4==4.12.3
bitsandbytes==0.43.3
markdownify==0.13.1
newspaper3k==0.2.8
readability_lxml==0.8.1
Requests==2.32.3
torch==2.4.1+cu124
transformers==4.45.0.dev0

run:
import nltk
nltk.download('punkt_tab')