Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
# Read the whole input file, then split it into lines without terminators.
with open("macbeth.txt", "r") as f:
    raw_text = f.read()
lines = raw_text.splitlines()
def keep(line):
    """Return True if ``line`` should be kept for sanitizing.

    A line is dropped when it contains any of the HTML markers listed in
    ``blacklist`` (case-sensitive substring match), or when it is empty or
    consists only of whitespace.

    Args:
        line: One line of the input text, without its line terminator.

    Returns:
        False for lines containing a blacklisted marker and for blank
        lines, True otherwise.
    """
    blacklist = (
        "<blockquote>", "</blockquote>", "<A NAME=speech", "<body>",
        "</body>", "<html>", "</html>", "<p>",
    )
    if any(term in line for term in blacklist):
        return False
    # Treat every whitespace-only line as blank, not just "" and " ": a line
    # of tabs or several spaces collapses to "" once its whitespace is
    # normalised, and indexing its last character then raises IndexError.
    return bool(line.strip())
def stripTags(value):
    """Return the text content of ``value`` with all HTML tags removed.

    Args:
        value: A string holding an HTML fragment.

    Returns:
        The text of the fragment as a text string (not bytes).
    """
    soup = BeautifulSoup(value, "html.parser")
    # get_text() yields the document's text as a text string. The former
    # renderContents() call is a deprecated BS3-compatibility method that
    # returns encoded bytes, which cannot be joined with a str separator.
    # NOTE(review): entities such as "&amp;" now come back as the character
    # they stand for rather than being re-escaped - confirm that is wanted.
    return soup.get_text()
# Drop lines that keep() rejects before touching the remaining text.
lines = [line for line in lines if keep(line)]

# Line endings that are left as they are; any other ending gets a period.
punctuation = [".", ",", "!", "?", ";"]
sanitized = []
for line in lines:
    line = stripTags(line)
    if isinstance(line, bytes):
        # stripTags may hand back encoded bytes (bs4's renderContents()
        # does, UTF-8 by default); normalise to text before joining.
        line = line.decode("utf-8")
    # Collapse runs of whitespace into single spaces and trim both ends.
    line = " ".join(line.split())
    if not line:
        # Nothing is left once the tags are gone (e.g. a tag-only line);
        # line[-1] below would raise IndexError on the empty string.
        continue
    if line[-1] not in punctuation:
        line += "."
    # Always end the entry with a space. Previously the separator was only
    # added together with the period, so lines that already ended in
    # punctuation were glued to the following line in the output.
    sanitized.append(line + " ")

# Plain write mode is enough: the file is never read back here.
with open("macbeth_sanitized", "w") as f:
    f.writelines(sanitized)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement