Advertisement
Guest User

Untitled

a guest
Sep 25th, 2017
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.83 KB | None | 0 0
  1. def pretreatment(fileName):
  2. with open(fileName, 'r') as file:
  3. text = file.readlines()
  4.  
  5. corpus = []
  6.  
  7. for occurrence in text:
  8. occurrence = occurrence.replace('.',' ')
  9. occurrence = occurrence.replace(',',' ')
  10. occurrence = occurrence.replace(';',' ')
  11. occurrence = occurrence.replace('(',' ')
  12. occurrence = occurrence.replace(')',' ')
  13. occurrence = occurrence.replace('?',' ')
  14. occurrence = occurrence.replace('!',' ')
  15. occurrence = occurrence.replace(':',' ')
  16.  
  17. corpus.append(occurrence)
  18.  
  19. return corpus
  20.  
  21. def lexical_analysis(corpus):
  22.  
  23. lexical_corpus = pretreatment(corpus)
  24.  
  25. tokens = nltk.word_tokenize(lexical_corpus)
  26.  
  27. return tokens
  28.  
  29. print(pretreatment("blabla.txt"))
  30. print(lexical_analysis(corpus))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement