# Untitled

from bs4 import BeautifulSoup

# Load the whole input file up front, then break it into lines with the
# line terminators removed.
with open("macbeth.txt", "r") as source:
    raw_text = source.read()
lines = raw_text.splitlines()

def keep(line):
    """Return True if *line* should survive the filtering pass.

    A line is rejected when it is empty or consists only of whitespace, or
    when it contains any of the HTML markers listed in ``blacklist``
    (matched as case-sensitive substrings).

    Args:
        line: One line of the input file, without its line terminator.

    Returns:
        False for blank lines and lines containing a blacklisted marker,
        True otherwise.
    """
    blacklist = (
        "<blockquote>", "</blockquote>", "<A NAME=speech", "<body>",
        "</body>", "<html>", "</html>", "<p>",
    )
    # Reject every whitespace-only line, not just "" and " ": a line of
    # tabs or several spaces carries no text either and would collapse to
    # an empty string once its whitespace is normalized.
    if not line.strip():
        return False
    return not any(term in line for term in blacklist)

def stripTags(value):
    """Return the text of the HTML fragment *value* with all tags removed.

    Args:
        value: A string containing HTML markup.

    Returns:
        The text content of the fragment as a ``str``. Character entities
        in the markup come back as the characters they stand for.
    """
    # get_text() returns a str. The previous renderContents() call returns
    # encoded bytes under bs4, which cannot be split and re-joined with a
    # str separator on Python 3.
    return BeautifulSoup(value, "html.parser").get_text()

# Drop blank lines and lines containing structural markup before any parsing.
lines = [line for line in lines if keep(line)]

# Characters that already close a line; anything else gets a "." appended.
punctuation = [".", ",", "!", "?", ";"]

sanitized = []

for line in lines:
    line = stripTags(line)
    # Collapse every run of whitespace to a single space and trim the ends.
    line = ' '.join(line.split())
    if not line:
        # A markup-only line leaves no text behind; indexing line[-1] on
        # it would raise IndexError, so skip it.
        continue
    if line[-1] not in punctuation:
        line += "."
    # writelines() adds no separators, so every entry carries its own
    # trailing space; otherwise a line that already ends in punctuation
    # would be glued directly onto the next one.
    sanitized.append(line + " ")

with open("macbeth_sanitized", "w+") as f:
    f.writelines(sanitized)