Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.26 KB | None | 0 0
  1. # A function to open the raw text
  2. def open_file(file_path):
  3.  
  4. with open(file_path, 'r') as file:
  5. raw_text = file.read().replace('\n', ' ')
  6.  
  7. return raw_text
  8.  
  9.  
  10. # A function that takes in the raw text of a string, removes special characters and
  11. # stop words, and returns the script text as a list where each element represents a word
  12. def cleaned_episode(raw_text, stop_words = False):
  13.  
  14. #RegEx to delete all text between and including brackets and parenthesis
  15. raw_text_no_notes = re.sub("[\(\[].*?[\)\]]", "", raw_text)
  16.  
  17. #Removes any special characters
  18. for symbol in "*,#-.?!''\n":
  19. raw_text_no_notes = raw_text_no_notes.replace(symbol, '').lower()
  20.  
  21. #Splits the text into a list of words
  22. cleaned_text = raw_text_no_notes.split(" ")
  23.  
  24. #Removes any word that containes a colon, i.e. character speaking indicators
  25. #Deletes spaces and blank elements
  26. for i in cleaned_text:
  27.  
  28. if i.endswith(':') == True or i == '' or i == ' ':
  29. cleaned_text.remove(i)
  30.  
  31. #Removes any stop words passed as a list
  32. if stop_words:
  33.  
  34. cleaned_text = [word for word in cleaned_text if word.lower() not in stop_words]
  35.  
  36. #Returns the raw text as a list of words
  37. return cleaned_text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement