Advertisement
Guest User

Text Encoding Issue Example

a guest
Feb 19th, 2023
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.06 KB | Source Code | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3.  
  4. def get_text(url):
  5.     response = requests.get(url)
  6.     if response.status_code == 200:
  7.         soup = BeautifulSoup(response.content, "html.parser")
  8.         return " ".join(s for s in soup.stripped_strings)
  9.     else:
  10.         return None
  11.  
  12. def extract(encoding, urls):
  13.     url_values = urls
  14.     i = 1
  15.     for url in url_values:
  16.         try:
  17.             #RETRIEVE TEXT USING BS4
  18.             text = get_text(url)
  19.             #IF TEXT IS FOUND, WRITE IT AS IS TO A FILE FOR THAT SCRIPT
  20.             if text:
  21.                 text = text.lower() #convert all to lowercase
  22.                 text = text.replace("::",":") #replace double ":" with single ":"
  23.                 text = text.replace('&nbsp',"") #remove &nbsp occurrences
  24.                 index = text.find('schrute space') #find "schrute space" which indicates the start of the script due to how website is arranged (elimanates menu text)
  25.                 text = text[index + len('schrute space'):] #extract everything after the occurrence of schrute space
  26.                 index = text.find('deleted scene') #find "deleted scene" which indicates the end of the main script
  27.                 text = text[:index] #trim text to be everything up to the "deleted scene" text occurrence
  28.                
  29.                 #create dynamic file name
  30.                 filename = f'the_office_script_#{i}.txt'
  31.                
  32.                 #Encode the text here
  33.                 text = text.encode(encoding)
  34.                 with open(filename,"wb") as file:
  35.                     file.write(text)
  36.             else:
  37.                 print("Failed to get text from", url)
  38.         except AttributeError as e:
  39.             print(f"No text found at: {url}. Error code: {e}")
  40.             continue
  41.        
  42.         except requests.exceptions.RequestException as e:
  43.             print(f"Error connecting to {url}: {e}")
  44.             continue
  45.  
  46.         i += 1
  47.  
  48. encoding = 'utf-8'
  49. urls = ['https://www.officequotes.net/no1-04.php','https://www.officequotes.net/no9-23.php']
  50. extract(encoding, urls)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement