Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
def tokenize(f):
    """Read an email-like file object and return its lowercased body tokens.

    Skips everything up to and including the first blank line (the header
    block), lowercases the remainder, converts common punctuation characters
    to spaces, splits on whitespace, and drops any token present in the
    module-level ``stopwords`` collection.

    :param f: an open text-mode file object positioned at the start.
    :return: list of lowercase tokens, stopwords removed.
    """
    data = f.read()
    # Skip headers: the body starts after the first blank line ('\n\n').
    # eoh+2 skips BOTH newline characters (the original used eoh+1, leaving
    # a stray '\n' at the front). Guard the -1 "no separator found" case so
    # we don't accidentally slice off the first character.
    eoh = data.find('\n\n')
    if eoh != -1:
        data = data[eoh + 2:]
    data = data.lower()
    # Map punctuation to spaces in one C-level pass instead of chained
    # .replace() calls. More opportunities here if this turns out to be a
    # good idea.
    punct = "\n.!':?@=,"
    data = data.translate(str.maketrans(punct, ' ' * len(punct)))
    # Separate adjacent tags (e.g. "<a><b>") so they tokenize individually.
    data = data.replace('><', '> <')
    # str.split() with no argument never yields empty strings, so no
    # explicit length filter is needed.
    # NOTE: `stopwords` is defined elsewhere in this module.
    return [t for t in data.split() if t not in stopwords]
Add Comment
Please, Sign In to add comment