Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import codecs
class preprocessor():
    """Hold the text of a book and strip selected punctuation from it.

    The text lives in ``self.bookString``; it starts out empty, is filled
    by ``read_text`` and is rewritten in place by ``clean``.
    """

    # Single-pass character mapping used by ``clean``. Hyphen, underscore
    # and the straight double quote become a space; straight and curly
    # apostrophes and curly double quotes are deleted outright.
    _CLEAN_TABLE = str.maketrans({
        '-': ' ',
        '_': ' ',
        '"': ' ',
        "'": None,
        '“': None,
        '”': None,
        '’': None,
    })

    def __init__(self) -> None:
        """Start with an empty book text."""
        self.bookString = ""

    def __str__(self) -> str:
        """Return the current book text."""
        return self.bookString

    def clean(self) -> None:
        """Replace or remove punctuation in ``self.bookString`` in place.

        An empty text is left unchanged, so no explicit emptiness check is
        required before translating.
        """
        self.bookString = self.bookString.translate(self._CLEAN_TABLE)

    def read_text(self, text_name: str) -> None:
        """Load ``<text_name>.txt`` (UTF-8) into ``self.bookString``.

        Args:
            text_name: Path of the file to read, without the ``.txt``
                extension, which is appended here.

        Raises:
            OSError: If the file cannot be opened (for example, it does
                not exist).
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        fileName = text_name + ".txt"
        # The context manager closes the handle even if read() fails.
        with io.open(fileName, mode="r", encoding="utf-8") as handle:
            self.bookString = handle.read()
# Create the preprocessor and load the book text from "1952-0.txt".
process = preprocessor()
process.read_text("1952-0")

# Show the loaded text on the terminal. Note that clean() is not called
# here, so the text is printed and saved exactly as it was read.
print(str(process))

# Write a copy to Output.txt for debugging. The input was decoded as UTF-8,
# so the output is encoded the same way; relying on the platform default
# encoding can fail on non-ASCII characters such as curly quotes. The
# context manager closes the file even if the write raises.
with open("Output.txt", "w", encoding="utf-8") as text_file:
    text_file.write(str(process))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement