Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
def get_text(url, timeout=10):
    """Fetch *url* and return its visible text as one space-joined string.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` may block indefinitely on a
            stalled connection.

    Returns:
        The page's whitespace-stripped text fragments joined by single
        spaces, or ``None`` when the response status code is not 200.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.content, "html.parser")
    return " ".join(soup.stripped_strings)
def _trim_to_main_script(text):
    """Normalize scraped page text and cut it down to the main script."""
    text = text.lower()
    text = text.replace("::", ":")  # collapse doubled colons
    # Strip non-breaking spaces (what "&nbsp;" becomes after HTML parsing).
    # Written as an escape so it cannot be mistaken for an ordinary space,
    # which must survive for the two-word markers below to match.
    text = text.replace("\xa0", "")
    # "schrute space" marks where the script starts on the page (everything
    # before it is menu text). If the marker is absent, keep the text as is
    # instead of blindly chopping characters off the front.
    _, start_marker, after_start = text.partition("schrute space")
    if start_marker:
        text = after_start
    # "deleted scene" marks the end of the main script. partition() returns
    # the whole string when the marker is absent, so nothing is lost.
    return text.partition("deleted scene")[0]


def extract(encoding, urls):
    """Download each URL and save its trimmed script text to a numbered file.

    Files are written to the current working directory as
    ``the_office_script_#<n>.txt``. The counter starts at 1 and advances
    after every URL whose fetch did not raise (including one that yielded
    no text); a URL whose fetch raises is reported and does not consume a
    number.

    Args:
        encoding: Codec name used to encode the text before writing.
        urls: Iterable of page URLs to scrape.
    """
    script_number = 1
    for url in urls:
        # Only the fetch is guarded: it is the step that raises the
        # exceptions handled here.
        try:
            text = get_text(url)
        except AttributeError as e:
            print(f"No text found at: {url}. Error code: {e}")
            continue
        except requests.exceptions.RequestException as e:
            print(f"Error connecting to {url}: {e}")
            continue

        if text:
            filename = f'the_office_script_#{script_number}.txt'
            # Encode before opening so a codec error leaves no empty file.
            payload = _trim_to_main_script(text).encode(encoding)
            with open(filename, "wb") as file:
                file.write(payload)
        else:
            print("Failed to get text from", url)
        script_number += 1
# Script driver: output encoding and the pages to scrape.
encoding = "utf-8"
urls = [
    "https://www.officequotes.net/no1-04.php",
    "https://www.officequotes.net/no9-23.php",
]
extract(encoding, urls)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement