Advertisement
Guest User

Untitled

a guest
Nov 18th, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.35 KB | None | 0 0
  1. # Exercice 1 -
  2. #1 - Tokenisation = separer la ponctuation des mots
  3. import nltk
  4. import pprint
  5. import treetaggerwrapper
  6.  
  7. def tokenisation(file,name):
  8. file_content = open(file,encoding="utf8")
  9. tokens = nltk.word_tokenize(file_content.read())
  10. new_file = open("./tok/English/"+name+".tok",'w')
  11. print("test")
  12. print(file_content.read())
  13. for line in file_content.read():
  14. print(line)
  15. tokens = nltk.word_tokenize(line)
  16. print(tokens)
  17. for word in tokens:
  18. new_file.write(word+ " ")
  19. new_file.close()
  20.  
  21. tokenisation("./txt/English/-10-000-gold-.txt","-10-000-gold-")
  22.  
  23. from nltk.stem import WordNetLemmatizer
  24.  
  25. def lemmatization(file,name):
  26. file_content = open(file)
  27. lemmatizer = WordNetLemmatizer()
  28. new_file = open("./lem/English/"+name+".lem","w")
  29. for line in file_content:
  30. for word in line.split():
  31. new_lem = lemmatizer.lemmatize(word)
  32. new_file.write(new_lem+ " ")
  33. new_file.close()
  34.  
  35. #lemmatization("./tok/English/-10-000-gold-.tok","-10-000-gold-")
  36.  
  37. def pos_tagging(file,name):
  38. file_content = open(file)
  39. tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
  40. for line in file_content:
  41. tags = tagger.tag_text(line)
  42. print(tags[0])
  43.  
  44. pos_tagging("./tok/English/-10-000-gold-.tok","-10-000-gold-")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement