from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import re
import html
import unicodedata
from newspaper import Article
from readability import Document
from bs4 import BeautifulSoup
import requests
import math

# Check if a GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Number of tokens to overlap between consecutive chunks
overlap = 50

# Batch size for summarization (number of chunks to summarize at once)
batch_size = 25

# Maximum input length for the model (overwritten in init_bart)
model_max_length = 512

# Model tokenizer (set in init_bart)
tokenizer = None

# Model instance (set in init_bart)
model = None

# Model name (set in init_bart)
model_name = None


# Enum-like class holding the supported summary lengths
class SummaryLength:
    SHORT = "short"
    MEDIUM = "medium"
    LONG = "long"

# Write content to a file, overwriting it if it already exists
def overwrite_file(file_path: str, new_content: str):
    """Overwrite the file with the new content.

    Args:
        file_path: The path to the file to overwrite.
        new_content: The new content to write to the file.
    """
    # Make sure the new content is not empty
    if not new_content:
        print("Empty content supplied!")
        return
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(new_content)

# Clean text of unnecessary characters and escape sequences
def clean_text(text_content: str):
    """Clean the text content of unnecessary characters and escape sequences, replacing them with their plain-text equivalents.

    Args:
        text_content: The text content to clean.

    Returns:
        The cleaned text content.
    """
    # Convert HTML-escaped characters to their plain-text representation
    cleaned_text = html.unescape(text_content)
    # Normalize Unicode characters (replace special characters)
    cleaned_text = unicodedata.normalize('NFKC', cleaned_text)
    # Clean up remaining whitespace characters (non-breaking spaces, zero-width spaces, etc.)
    cleaned_text = cleaned_text.replace('\xa0', ' ')    # Non-breaking space (U+00A0) to normal space
    cleaned_text = cleaned_text.replace('\u200b', ' ')  # Zero-width space
    cleaned_text = cleaned_text.replace('\u200c', ' ')  # Zero-width non-joiner
    cleaned_text = cleaned_text.replace('\u200d', ' ')  # Zero-width joiner
    cleaned_text = cleaned_text.replace('\uFEFF', ' ')  # Zero-width no-break space (BOM)
    cleaned_text = cleaned_text.replace('\u3000', ' ')  # Ideographic space to normal space
    cleaned_text = cleaned_text.replace('©', '(c)')     # © → "(c)"
    cleaned_text = cleaned_text.replace('®', '(R)')     # ® → "(R)"
    cleaned_text = cleaned_text.replace('™', '(TM)')    # ™ → "(TM)"
    # Handle smart quotes and other special punctuation
    cleaned_text = cleaned_text.replace('‘', "'").replace('’', "'")  # Curly single quotes to straight single quotes
    cleaned_text = cleaned_text.replace('“', '"').replace('”', '"')  # Curly double quotes to straight double quotes
    cleaned_text = cleaned_text.replace('–', '-').replace('—', '-')  # En dash and em dash to hyphen
    cleaned_text = cleaned_text.replace('…', '...')                  # Ellipsis to three dots
    # Replace runs of tabs with a single space
    cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    # Convert lines containing only whitespace into empty lines
    cleaned_text = re.sub(r'^\s+$', '', cleaned_text, flags=re.MULTILINE)
    # Normalize carriage returns (\r\n and bare \r) to newlines (\n)
    cleaned_text = re.sub(r'\r\n?', '\n', cleaned_text)
    # Collapse runs of three or more newlines into a single newline
    cleaned_text = re.sub(r'\n{3,}', '\n', cleaned_text)
    return cleaned_text.strip()
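
# Example (illustrative input, not from the original paste):
#   clean_text('A&amp;B\u00a0 “quoted” text…')  ->  'A&B "quoted" text...'
# The HTML entity is unescaped, the non-breaking space collapses into a single
# normal space, and the smart quotes and ellipsis are straightened.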

# Extract the main content from an HTML page
def extract_main_html_content(html_content: str):
    """Extract the main content from the HTML page — the readable part, without the menus, header, footer, ads, etc.

    Args:
        html_content: The HTML page content to extract the main content from.

    Returns:
        The main content extracted from the HTML page content.
    """
    # If the HTML content is empty, return an empty string
    if not html_content:
        print("Empty HTML content supplied!")
        return ""
    try:
        # Try newspaper3k first. Create an Article object (no URL is needed since we supply raw HTML)
        article = Article(url="")
        # Set the article's HTML content
        article.set_html(html_content)
        # Parse the article (required before extracting information)
        article.parse()
        newspaper_text_main_content = article.text
    except Exception as err:
        print(f"Error parsing article with newspaper3k: {err}")
        newspaper_text_main_content = None
    if newspaper_text_main_content:
        html_main_content = newspaper_text_main_content.strip()
        print("Newspaper3k worked!")
    else:
        try:
            # If newspaper fails, try the Python port of Ruby's Readability
            doc = Document(html_content)
            readability_html_main_content = doc.summary(html_partial=True)
        except Exception as err:
            print(f"Error parsing article with readability: {err}")
            readability_html_main_content = None
        if readability_html_main_content:
            html_main_content = readability_html_main_content.strip()
            print("Readability worked!")
        else:
            # Both extractors failed; give up
            print("Error parsing article with both approaches. Exiting.")
            raise SystemExit(1)
    # Clean the extracted content
    cleaned_html_main_content = clean_text(html_main_content)
    return cleaned_html_main_content
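
# Note: newspaper3k yields plain text (article.text) while readability yields
# an HTML fragment, so the value returned above may be either. html_to_text()
# below handles both, since BeautifulSoup parses plain text as a bare node.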

# Convert HTML content to plain text
def html_to_text(html_content: str):
    """Convert HTML content to text content.

    Args:
        html_content: The HTML content to convert to text content.

    Returns:
        The text content extracted from the HTML content.
    """
    # If the HTML content is empty, return an empty string
    if not html_content:
        print("Empty HTML content supplied!")
        return ""
    # Extract the text content using BeautifulSoup with the lxml parser
    soup = BeautifulSoup(html_content, "lxml")
    text_content = soup.get_text().strip()
    # Clean the text content of unnecessary characters and escape sequences
    cleaned_text_content = clean_text(text_content)
    return cleaned_text_content

def get_url_html(url):
    """Get the HTML content of the page at the given URL.

    Args:
        url: The URL of the page to fetch.

    Returns:
        The HTML content of the page as a string, or None on failure.
    """
    session = requests.Session()
    # Browser-like headers to reduce the chance of being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        response = session.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an error for bad responses
        return response.text
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return None
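
# Note: the browser-like User-Agent and Referer headers above are a common way
# to get past naive bot blocking; some sites will still refuse the request or
# serve different content, in which case a headless browser would be needed.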

# Initialize the BART model and tokenizer
def init_bart():
    """Initialize the BART model and tokenizer."""
    global model, tokenizer, device, model_max_length, model_name
    # Load the BART model and tokenizer
    model_name = 'facebook/bart-large-cnn'
    model_max_length = 1024
    tokenizer = BartTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
    if device == 'cuda':
        # fp16 + FlashAttention 2 (requires the separate flash-attn package and a supported GPU)
        model = BartForConditionalGeneration.from_pretrained(
            model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
        ).to(device)
    else:
        # CPU fallback: fp16 and flash_attention_2 are CUDA-only, so load with the defaults
        model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
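
# Note (rough figures, not from the original paste): bart-large-cnn has about
# 400M parameters, so float16 weights take roughly 0.8 GB of GPU memory versus
# ~1.6 GB in float32.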

# Calculate the minimum and maximum length of the output summary
def calc_min_max_length(original_num_tokens: int, summary_length: str):
    """
    Calculate the minimum and maximum length of the output summary based on the original text length.

    Args:
        original_num_tokens: The number of tokens in the original text.
        summary_length: Desired length of the summary ('short', 'medium', 'long').

    Returns:
        The minimum and maximum length of the output summary.
    """
    if summary_length == SummaryLength.SHORT:
        min_length = min(40, original_num_tokens * 0.1)
        max_length = min(150, original_num_tokens * 0.15)
    elif summary_length == SummaryLength.MEDIUM:
        min_length = min(150, original_num_tokens * 0.2)
        max_length = min(250, original_num_tokens * 0.35)
    elif summary_length == SummaryLength.LONG:
        min_length = min(250, original_num_tokens * 0.4)
        max_length = min(450, original_num_tokens * 0.5)
    else:
        raise ValueError(f"Unknown summary length: {summary_length}")
    # Make sure min_length and max_length are integers
    min_length = int(min_length)
    max_length = int(max_length)
    # Ensure that the minimum length is at least 30
    min_length = max(30, min_length)
    # Ensure that the maximum length is at least the minimum length
    max_length = max(min_length, max_length)
    return min_length, max_length
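
# Worked example (illustrative numbers): for a 2,000-token article with
# SummaryLength.MEDIUM, min_length = min(150, 2000 * 0.2) = 150 and
# max_length = min(250, 2000 * 0.35) = 250, so the summary is bounded
# to 150-250 tokens.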

# Summarize the text content in chunks
def summarize_text(cleaned_text_content: str, summary_length: str, chunk_size: int):
    """
    Summarizes the cleaned text content using the BART model.

    Args:
        cleaned_text_content: The text content to summarize.
        summary_length: Desired length of the summary ('short', 'medium', 'long').
        chunk_size: The number of tokens in each chunk (including overlap); must be less than or equal to the model's max length.

    Returns:
        The summarized text content.
    """
    global model_max_length, overlap, batch_size, tokenizer
    # Tokenize the input text
    tokens = tokenizer.encode(cleaned_text_content, return_tensors='pt')
    # Count the token length of the text
    original_num_tokens = tokens.shape[1]
    print(f'Original tokens length: {original_num_tokens}')
    # Calculate the min & max summary length based on the TOTAL text length
    output_min_length, output_max_length = calc_min_max_length(original_num_tokens, summary_length)
    # Directly summarize if the content is short enough to fit in one pass
    if original_num_tokens <= model_max_length:
        print('Summarizing single chunk...')
        combined_summary = summarize_single_chunk(tokens=tokens, output_min_length=output_min_length, output_max_length=output_max_length)
        combined_summary = combined_summary[0]
    else:
        # Collect the per-chunk summaries here
        final_summary = []
        # Consecutive chunks share `overlap` tokens, so each chunk advances by this stride
        stride = chunk_size - overlap
        # Calculate the total number of chunks and batches needed to summarize the full text
        num_chunks = math.ceil(max(original_num_tokens - overlap, 1) / stride)
        num_batches = math.ceil(num_chunks / batch_size)
        print(f'Number of chunks: {num_chunks}, Number of batches: {num_batches}')
        # Prepare the chunks: the text is split into chunks of chunk_size tokens,
        # with an overlap of `overlap` tokens between consecutive chunks so the
        # model keeps some context across chunk boundaries
        chunks = []
        for i in range(num_chunks):
            start_index = i * stride
            end_index = min(start_index + chunk_size, original_num_tokens)
            chunk = tokens[:, start_index:end_index]
            # If the chunk is shorter than the model max length, pad it with the pad token
            # (generate() infers the attention mask from the pad token)
            if chunk.shape[1] < model_max_length:
                padding_length = model_max_length - chunk.shape[1]
                padding = torch.full((1, padding_length), tokenizer.pad_token_id, dtype=torch.long)
                chunk = torch.cat([chunk, padding], dim=1)
            chunks.append(chunk)
        # Calculate the min & max summary length based on the CHUNK length
        chunk_output_min_length, chunk_output_max_length = calc_min_max_length(chunk_size, summary_length)
        # Summarize the chunks batch by batch and collect the results
        for i in range(0, len(chunks), batch_size):
            # Get the chunks for the current batch
            batch_chunks = chunks[i:i + batch_size]
            # Combine the batch of chunks into a single tensor
            batched_input = torch.cat(batch_chunks, dim=0)
            summarized_chunks = summarize_single_chunk(tokens=batched_input, output_min_length=chunk_output_min_length, output_max_length=chunk_output_max_length)
            # Append the batch's summaries to the final summary
            final_summary.extend(summarized_chunks)
        # Combine all summarized chunks into a single text
        combined_summary = ' '.join(final_summary)
    print(f'Summary tokens length: {len(tokenizer.tokenize(combined_summary))}')
    return combined_summary
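
# Chunking example (illustrative numbers): with chunk_size=1024 and overlap=50
# the stride is 974 tokens, so a 3,000-token article is split into
# ceil((3000 - 50) / 974) = 4 chunks, which fit into a single batch of up to
# batch_size=25 chunks.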

# Summarize a single chunk (or batch of chunks) of tokens
def summarize_single_chunk(tokens: torch.Tensor, output_min_length: int, output_max_length: int):
    """
    Summarizes one batch of tokenized text.

    Args:
        tokens: The tokenized input text as a PyTorch tensor (one or more chunks).
        output_min_length: Minimum length of the output summary.
        output_max_length: Maximum length of the output summary.

    Returns:
        The summaries as a list of strings, one per input row.
    """
    global model, tokenizer, device, model_max_length
    # Move the input tokens to the target device
    inputs = tokens.to(device)
    # Generate the summaries using the model and torch autocast (mixed precision, CUDA only)
    with torch.no_grad():
        with torch.autocast(device_type=device, dtype=torch.float16, enabled=(device == 'cuda')):
            summary_ids = model.generate(
                inputs,
                max_length=output_max_length,  # The maximum length of the output summary
                min_length=output_min_length,  # The minimum length of the output summary
                num_beams=4,                   # The number of top-scoring sequences kept during beam search
                early_stopping=True,           # Stop the beam search once num_beams finished candidates exist per batch item
                length_penalty=1.2,            # >1 favours longer outputs, using more content from the input
                repetition_penalty=2.0,        # Discourage repetition
                no_repeat_ngram_size=4,        # Never repeat the same 4-token sequence within a summary
            )
    # Decode the summaries
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return summary

# Main entry point: summarize the text content of a URL
if __name__ == '__main__':
    url = "https://www.atltranslate.com/ai/blog/text-summarization-tips"
    try:
        print(f"Fetching content from: {url}")
        html_content = get_url_html(url)
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        html_content = None
    # Check whether the HTML content is None or empty
    if html_content:
        # Save the fetched content to a file
        overwrite_file('test_downloaded.html', html_content)
        # Extract the main content from the HTML
        cleaned_html_content = extract_main_html_content(html_content)
        cleaned_text_content = html_to_text(cleaned_html_content)
        # Save the cleaned text content to a new file
        overwrite_file('test_cleaned.txt', cleaned_text_content)
        # Check if there is any content to summarize
        if cleaned_text_content:
            init_bart()
            summary = summarize_text(cleaned_text_content, summary_length=SummaryLength.MEDIUM, chunk_size=model_max_length)
            summary = clean_text(summary)
            # Save the summary to a new file
            overwrite_file('test_summary.txt', summary)
            print("Summary saved to 'test_summary.txt'")
        else:
            print("No content remaining after extraction; nothing to summarize.")
Comments

- beautifulsoup4==4.12.3
- bitsandbytes==0.43.3
- markdownify==0.13.1
- newspaper3k==0.2.8
- readability_lxml==0.8.1
- Requests==2.32.3
- torch==2.4.1+cu124
- transformers==4.45.0.dev0
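- Assuming the pins above are saved to a requirements.txt file, they can be installed with: pip install -r requirements.txt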
- run once to download the NLTK tokenizer data:
- import nltk
- nltk.download('punkt_tab')