Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # A function to open the raw text
def open_file(file_path):
    """Read a text file and return its contents flattened onto one line.

    Args:
        file_path: Path of the file to open in text read mode.

    Returns:
        The file's full text with every newline character replaced by a
        single space.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    # Flatten after the file is closed; the result is the same string.
    return contents.replace('\n', ' ')
- # A function that takes in the raw text of a string, removes special characters and
- # stop words, and returns the script text as a list where each element represents a word
def cleaned_episode(raw_text, stop_words=False):
    """Turn raw script text into a list of cleaned, lowercase words.

    Processing steps, in order:
      1. Remove every span enclosed in parentheses or square brackets
         (brackets included). The pattern does not cross newlines, so a
         note that spans a line break is left in place.
      2. Delete the characters ``* , # - . ? ! '`` and newlines, then
         lowercase the text.
      3. Split on single spaces and drop empty tokens and tokens that end
         with a colon (speaker labels such as ``jerry:``).
      4. Optionally drop every word found in ``stop_words``.

    Args:
        raw_text: The script text to clean.
        stop_words: An iterable of words to exclude, or a falsy value
            (the default) to skip stop-word filtering. Words are compared
            after lowercasing, so entries should be lowercase.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Raw string keeps the regex escapes from being read as (invalid)
    # string escapes. Non-greedy match removes each note separately.
    without_notes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

    # Delete all unwanted characters in one pass, then lowercase once.
    strip_table = str.maketrans("", "", "*,#-.?!'\n")
    normalized = without_notes.translate(strip_table).lower()

    # Build a new list rather than removing from the list being iterated:
    # in-place removal skips the element following each removed one, which
    # let consecutive blanks and speaker labels slip through.
    words = [
        word
        for word in normalized.split(" ")
        if word and not word.endswith(":")
    ]

    if stop_words:
        # Build the lookup set once so each membership test is O(1).
        excluded = set(stop_words)
        words = [word for word in words if word not in excluded]

    return words
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement