Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Exercise 1
# 1 - Tokenization = separating punctuation from words
- import nltk
- import pprint
- import treetaggerwrapper
def tokenisation(file, name):
    """Tokenise the text of *file* with NLTK and write the tokens,
    separated by single spaces, to ./tok/English/<name>.tok.

    Bug fix: the original called .read() on the same handle several
    times — a file handle is exhausted after the first read, so the
    per-line loop never ran and the output file was left empty.  The
    content is now read exactly once.  Both files are opened with
    `with` so the handles are always closed (the input handle was
    never closed before).
    """
    with open(file, encoding="utf8") as file_content:
        text = file_content.read()
    tokens = nltk.word_tokenize(text)
    with open("./tok/English/" + name + ".tok", "w") as new_file:
        for word in tokens:
            new_file.write(word + " ")
- tokenisation("./txt/English/-10-000-gold-.txt","-10-000-gold-")
- from nltk.stem import WordNetLemmatizer
def lemmatization(file, name):
    """Lemmatize every whitespace-separated word of *file* with the
    WordNet lemmatizer and write the lemmas, separated by single
    spaces, to ./lem/English/<name>.lem.

    Fixes: the input is opened with an explicit utf8 encoding
    (consistent with tokenisation, which already did so); both file
    handles are managed with `with` so they are closed even if
    lemmatization raises (the original closed neither the input nor,
    on error, the output).
    """
    lemmatizer = WordNetLemmatizer()
    with open(file, encoding="utf8") as file_content, \
         open("./lem/English/" + name + ".lem", "w") as new_file:
        for line in file_content:
            for word in line.split():
                new_file.write(lemmatizer.lemmatize(word) + " ")
- #lemmatization("./tok/English/-10-000-gold-.tok","-10-000-gold-")
def pos_tagging(file, name):
    """POS-tag each line of *file* with TreeTagger and print the first
    tag of every line.

    Fixes: the input handle is now closed via `with` and opened as
    utf8 like the other helpers; a guard skips lines for which
    tag_text returns an empty list (e.g. blank lines), where the
    original `tags[0]` would raise IndexError.

    `name` is kept (currently unused) for signature consistency with
    tokenisation/lemmatization — presumably a ./pos/ output file was
    intended; TODO confirm.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    with open(file, encoding="utf8") as file_content:
        for line in file_content:
            tags = tagger.tag_text(line)
            if tags:  # blank input lines yield no tags
                print(tags[0])
- pos_tagging("./tok/English/-10-000-gold-.tok","-10-000-gold-")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement