Advertisement
Guest User

Untitled

a guest
May 1st, 2016
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.90 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2.  
  3. with open("macbeth.txt", "r") as f:
  4. lines = f.read().splitlines()
  5.  
  6. def keep(line):
  7. blacklist = ["<blockquote>", "</blockquote>", "<A NAME=speech", "<body>", "</body>", "<html>", "</html>", "<p>"]
  8. for term in blacklist:
  9. if term in line:
  10. return False
  11. if line == "" or line == " ":
  12. return False
  13. return True
  14.  
  15. def stripTags(value):
  16. soup = BeautifulSoup(value, "html.parser")
  17. for tag in soup.findAll(True):
  18. tag.hidden = True
  19. return soup.renderContents()
  20.  
  21. lines = [ line for line in lines if keep(line) ]
  22.  
  23. punctuation = [".", ",", "!", "?", ";"]
  24.  
  25. sanitized = []
  26.  
  27. for line in lines:
  28. line = stripTags(line)
  29. line = ' '.join(line.split())
  30. if line[-1] not in punctuation:
  31. line += ". "
  32. sanitized.append(line)
  33.  
  34. with open("macbeth_sanitized", "w+") as f:
  35. f.writelines(sanitized)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement