Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- #!/usr/local/bin/python3
- # -*- coding: utf-8 -*-
- import nltk
- import requests
# Read the input text up front so the rest of the script works on `obsah`.
# `with` guarantees the handle is closed (the original leaked it and shadowed
# the builtin name `file`); encoding is explicit to avoid platform defaults.
with open("myfile.txt", "r", encoding="utf-8") as f:
    obsah = f.read()
print(obsah)
print("############")
def lemmatize(word):
    """Return the lemma of *word* via a custom Elasticsearch analyzer.

    POSTs the UTF-8 encoded word to the `_analyze` endpoint and returns
    the text of the first token in the response.

    Raises:
        requests.HTTPError: on a non-2xx response (auth failure, bad index, ...).
        requests.RequestException: on connection/timeout errors.
        KeyError/IndexError: if the response contains no tokens.
    """
    payload = str(word).encode(encoding='utf-8')
    # NOTE(review): credentials and host are hard-coded — move them to
    # environment variables or a config file; never commit secrets.
    url = 'https://my.server.cz:9200/test_index/_analyze?analyzer=lematizer_custom'
    r = requests.post(url, auth=('username', 'password'), timeout=(3, 10), data=payload)
    # Fail loudly here instead of surfacing server errors as a confusing
    # KeyError when indexing into the JSON below.
    r.raise_for_status()
    data = r.json()
    # TODO: consider lemmatizing whole sentences in one request and
    # iterating over all returned tokens instead of one word per call.
    lemma = data['tokens'][0]['token']
    return lemma
# Tokenize the whole text, then print each token next to its lemma.
tokens = nltk.word_tokenize(obsah)
print(tokens)
print("\n")
for token in tokens:
    try:
        print(token, " ", lemmatize(token))
    except Exception as exc:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Keep the best-effort behaviour,
        # but say which token failed and why.
        print("bad token", token, exc)
Add Comment
Please sign in to add a comment.