
Web Page Word Interrogation

Dec 19th, 2023 (edited)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

#functions
def extract_text_from_website(url):
    #fetch the page, strip the markup, and dump the lower-cased text to dump_file_name
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_text = soup.get_text(separator=' ')
        lower_text = all_text.lower()
        #dump_file_name is set at module level below; "with" ensures the file is closed
        with open(dump_file_name, "w", encoding="utf8") as file:
            file.write(lower_text)
    else:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")

def clean_text(input_file):
    #remove punctuation, digits and common stop words, rewriting input_file in place
    stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'you', 'we', 'be', 'are', 'is', 'this', 'if', 'which', 'it', 'do', 'also', 'us', 'may', 'their', 'put']
    try:
        with open(input_file, 'r', encoding="utf8") as infile:
            content = infile.read()
            #keep letters and whitespace only
            alpha_content = re.sub(r'[^A-Za-z\s]', '', content)
            words = alpha_content.split()
            non_stop_words = []
            for word in words:
                if word not in stop_words:
                    non_stop_words.append(word)
            #join once, after the loop finishes
            clean_content = ' '.join(non_stop_words)
        with open(input_file, 'w', encoding="utf8") as outfile:
            outfile.write(clean_content)
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def generate_list():
    #count word frequencies in the cleaned dump and overwrite it with a ranked report
    word_count = 0
    d = dict()
    with open(dump_file_name, "r", encoding="utf8") as text:
        for line in text:
            words = line.split(" ")
            #count every word on every line (the loop must be nested inside the line loop)
            for word in words:
                if word in d:
                    d[word] = d[word] + 1
                else:
                    d[word] = 1
                    word_count = word_count + 1  #number of distinct words seen
    ds = sorted(d.items(), key=lambda t: t[1], reverse=True)
    with open(dump_file_name, "w", encoding="utf8") as file:
        file.write("Content Word Density Summary\n")
        file.write("URL: " + target_url + "\n")
        file.write("Word Count: " + str(word_count) + "\n")
        file.write("----------------------------\n")
        for items in ds:
            file.write(f"{items}\n")

def extract_domain(target_url):
    #return just the host part of the URL, e.g. 'en.wikipedia.org'
    parsed_url = urlparse(target_url)
    domain = parsed_url.netloc
    return domain

#main script
target_url = 'https://en.wikipedia.org/wiki/Antarctica'
domain = extract_domain(target_url)
dump_file_name = domain + '.txt'  #e.g. en.wikipedia.org.txt
extract_text_from_website(target_url)
clean_text(dump_file_name)
generate_list()

#for development--------------------------------------
#word stats 1.5
#remove stop words 1.6
#remove brackets and quotes in report (1.7)
#input to request the url (see the sketch after this list)
#combine links (broken)
#combine meta information
#combine alt-text (see the sketch after this list)
#add page depth (internal link following) option
#gui version

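#a minimal, untested sketch of the "input to request the url" item above; the function
#name, prompt wording, and default value are placeholders, not part of the original script
def prompt_for_url(default_url='https://en.wikipedia.org/wiki/Antarctica'):
    #ask for a URL on stdin and fall back to the default when nothing is entered
    entered = input(f"Enter a URL to analyse [{default_url}]: ").strip()
    return entered if entered else default_url
#usage: replace the hard-coded target_url assignment above with
#target_url = prompt_for_url()

#a similarly hedged sketch of the "combine alt-text" item: gather alt attributes from
#<img> tags so they could be appended to the text dump; the function name is a placeholder
def extract_alt_text(soup):
    #collect every non-empty alt attribute found on the page
    alts = [img.get('alt', '') for img in soup.find_all('img')]
    return ' '.join(alt for alt in alts if alt)
#usage (inside extract_text_from_website, after soup is built):
#all_text = all_text + ' ' + extract_alt_text(soup)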