Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urlparse
- import re
- import string
- #functions
def extract_text_from_website(url):
    """Fetch *url*, extract all visible text, lowercase it, and write it
    to the module-level ``dump_file_name`` file.

    On a non-200 response nothing is written and a diagnostic is printed.
    NOTE(review): relies on the module-level global ``dump_file_name``
    being defined before this is called — set by the script body below.
    """
    # A browser-like User-Agent: some sites refuse the default
    # python-requests agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_text = soup.get_text(separator=' ')
        lower_text = all_text.lower()
        # 'with' guarantees the handle is closed; the original opened the
        # file and never closed it (resource leak / possible unflushed data).
        with open(dump_file_name, "w", encoding="utf8") as dump_file:
            dump_file.write(lower_text)
    else:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
def clean_text(input_file):
    """Strip non-alphabetic characters and stop words from *input_file*,
    rewriting the file in place with the surviving words joined by single
    spaces.

    Bug fix: the original ignored ``input_file`` and operated on the
    module-level global ``dump_file_name``; the parameter is now honored
    (the existing call site passes the same value, so behavior at the
    call site is unchanged).

    Any I/O error is caught and reported rather than propagated.
    """
    # A set gives O(1) membership tests (the original used a list).
    stop_words = {'a', 'an', 'the', 'and', 'but', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'you', 'we', 'be', 'are', 'is', 'this', 'if', 'which', 'it', 'do', 'also', 'us', 'may', 'their', 'put'}
    try:
        with open(input_file, 'r', encoding="utf8") as infile:
            content = infile.read()
        # Keep only letters and whitespace; digits and punctuation vanish.
        alpha_content = re.sub(r'[^A-Za-z\s]', '', content)
        non_stop_words = [word for word in alpha_content.split()
                          if word not in stop_words]
        clean_content = ' '.join(non_stop_words)
        with open(input_file, 'w', encoding="utf8") as outfile:
            outfile.write(clean_content)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
def generate_list(file_name=None, url=None):
    """Count word frequencies in the cleaned dump file and overwrite it
    with a "Content Word Density Summary" report (word count, then one
    ``('word', count)`` line per word, most frequent first).

    Parameters are optional for backward compatibility: when omitted they
    fall back to the module-level ``dump_file_name`` / ``target_url``
    globals, exactly as the existing call site (``generate_list()``)
    expects.

    Fixes over the original: the input file handle is now closed (it was
    leaked), and ``split()`` is used instead of ``split(" ")`` so runs of
    whitespace no longer produce empty-string "words" and a trailing
    newline is not glued onto the last word.
    """
    if file_name is None:
        file_name = dump_file_name
    if url is None:
        url = target_url
    word_count = 0
    counts = {}
    with open(file_name, "r", encoding="utf8") as text:
        for line in text:
            for word in line.split():
                counts[word] = counts.get(word, 0) + 1
                word_count += 1
    # Most frequent first; ties keep first-seen order (sorted is stable).
    ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
    with open(file_name, "w", encoding="utf8") as file:
        file.write("Content Word Density Summary\n")
        file.write("URL: " + url + "\n")
        file.write("Word Count: " + str(word_count) + "\n")
        file.write("----------------------------\n")
        for items in ranked:
            file.write(f"{items}\n")
def extract_domain(target_url):
    """Return the network-location (domain) part of *target_url*,
    e.g. ``'en.wikipedia.org'`` for a Wikipedia article URL."""
    return urlparse(target_url).netloc
# -- script driver: fetch the page, clean the dump, write the report ------
# These module-level names are read by the functions above, so they must
# stay global with these exact names.
target_url = 'https://en.wikipedia.org/wiki/Antarctica'
domain = extract_domain(target_url)
dump_file_name = domain + '.txt'  # e.g. "en.wikipedia.org.txt"
extract_text_from_website(target_url)
clean_text(dump_file_name)
generate_list()
- #for development--------------------------------------
- #word stats 1.5
- #remove stop words 1.6
- #remove brackets and quotes in report (1.7)
- #input to request the url
- #combine links (broken)
- #combine meta information
- #combine alt-text
- #add page depth (internal link following) option
- #gui version
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement