Text Encoding Issue Example

import requests
from bs4 import BeautifulSoup

def get_text(url, timeout=10):
    """Fetch *url* and return the page's text as one space-joined string.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            server would block the caller indefinitely.

    Returns:
        Every whitespace-stripped string in the parsed document joined with
        single spaces, or ``None`` when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            including the timeout expiring.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # Hand BeautifulSoup the raw bytes so it performs its own charset
    # detection rather than relying on a pre-decoded string.
    soup = BeautifulSoup(response.content, "html.parser")
    return " ".join(soup.stripped_strings)

def _clean_script_text(text):
    """Normalise scraped page text and trim it down to the main script."""
    start_marker = 'schrute space'  # last piece of menu text before the script
    end_marker = 'deleted scene'    # first heading after the main script

    text = text.lower()
    text = text.replace("::", ":")  # collapse doubled colons
    text = text.replace('&nbsp', "")  # literal leftovers, e.g. double-escaped markup
    # The HTML parser has already decoded "&nbsp;" entities into U+00A0, so the
    # literal replacement above never sees them. Left in place they are written
    # out as the bytes C2 A0 in UTF-8 and show up as stray characters in
    # viewers that assume another encoding; turn them into ordinary spaces.
    text = text.replace("\xa0", " ")

    # partition() leaves the text untouched when a marker is absent, whereas
    # slicing on find() == -1 silently chopped characters off either end.
    _, found, after_start = text.partition(start_marker)
    if found:
        text = after_start
    return text.partition(end_marker)[0]


def extract(encoding, urls):
    """Download each URL and save its cleaned script text to a numbered file.

    For every URL the page text is fetched with ``get_text``, cleaned, encoded
    with *encoding* and written in binary mode to
    ``the_office_script_#<n>.txt`` in the current working directory.

    The file number starts at 1 and advances after every URL that did not
    raise, including URLs for which no text was returned; a URL that raises
    one of the handled exceptions leaves the number unchanged.

    Args:
        encoding: Name of the codec used to encode the text, e.g. ``'utf-8'``.
        urls: Iterable of page URLs to process, in order.

    Raises:
        UnicodeEncodeError: If the cleaned text contains a character that
            *encoding* cannot represent (not caught here).
    """
    i = 1
    for url in urls:
        try:
            text = get_text(url)
            if text:
                # Encode before opening the file so that an encoding failure
                # does not leave an empty file behind.
                payload = _clean_script_text(text).encode(encoding)
                filename = f'the_office_script_#{i}.txt'
                with open(filename, "wb") as file:
                    file.write(payload)
            else:
                print("Failed to get text from", url)
        except AttributeError as e:
            print(f"No text found at: {url}. Error code: {e}")
            continue

        except requests.exceptions.RequestException as e:
            print(f"Error connecting to {url}: {e}")
            continue

        i += 1

# Codec handed to extract() for encoding the text before it is written.
encoding = "utf-8"

# Pages handed to extract(), processed in this order.
urls = [
    "https://www.officequotes.net/no1-04.php",
    "https://www.officequotes.net/no9-23.php",
]

extract(encoding, urls)