Typhoon

Tokenize and Lemmatize text

Feb 2nd, 2016
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.71 KB | None | 0 0
  1. #!/usr/local/bin/python3
  2. # -*- coding: utf-8 -*-
  3.  
  4. import nltk
  5. import requests
  6.  
  7. file = open("myfile.txt", "r")
  8. obsah = file.read()
  9.  
  10. print(obsah)
  11. print("############")
  12.  
  13. def lemmatize(word):
  14.     word=str(word).encode(encoding='utf-8')
  15.     url = 'https://my.server.cz:9200/test_index/_analyze?analyzer=lematizer_custom'
  16.     r = requests.post(url, auth=('username', 'password'), timeout=(3, 10), data=word)
  17.     data = r.json()
  18.     lemma = data['tokens'][0]['token'] # Consider lemmatizing whole sentences, without number but iterating in result.
  19.     return lemma
  20.  
  21. tokens = nltk.word_tokenize(obsah)
  22. print(tokens)
  23. print("\n")
  24.  
  25. for token in tokens :
  26.     try:
  27.         print(token, " ", lemmatize(token))
  28.  
  29.     except:
  30.         print("bad token")
Add Comment
Please, Sign In to add comment