Guest User

Untitled

a guest
Jan 10th, 2018
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.69 KB | None | 0 0
  1. import nltk
  2.  
  3. class Word:
  4.  
  5. def __init__(self, posi, token, stem, pos):
  6. self.__token = token
  7. self.__stem = stem
  8. self.__pos = pos
  9. self.__posi = posi
  10.  
  11. def get_token(self):
  12. return self.__token
  13.  
  14. def get_stem(self):
  15. return self.__stem
  16.  
  17. def get_pos(self):
  18. return self.__pos
  19.  
  20. def get_posi(self):
  21. return self.__posi
  22.  
  23.  
  24. def annotate_corpus_with_nltk_pipeline(input_filename, language):
  25. '''Liest einen Text ein und speichert diesen annotiert in einem Dictionary mit den Wörtern als Instanzen.'''
  26. with open(input_filename, 'r', encoding='utf8') as f:
  27. read_text = f.read()
  28.  
  29. dic_text = {}
  30. list_text = nltk.sent_tokenize(read_text, language=language)
  31. for line in list_text:
  32. word_list_obj = []
  33. word_list = nltk.word_tokenize(line, language=language)
  34. pos_tag_tuples_list = nltk.pos_tag(word_list, tagset=None, lang=language[0:3])
  35. for posi,mtuple in enumerate(pos_tag_tuples_list):
  36. if mtuple[1] == 'CD' or mtuple[1].startswith('N'):
  37. wn_pos_tag = 'n'
  38. elif mtuple[1] == 'MD' or mtuple[1].startswith('V'):
  39. wn_pos_tag = 'v'
  40. elif mtuple[1].startswith('J'):
  41. wn_pos_tag = 'a'
  42. elif mtuple[1].startswith('R'):
  43. wn_pos_tag = 'r'
  44. else:
  45. wn_pos_tag = 'n'
  46. stem = nltk.stem.WordNetLemmatizer.lemmatize(self=None, word=mtuple[0], pos=wn_pos_tag)
  47. word = Word(posi, mtuple[0], stem, mtuple[1])
  48. word_list_obj.append(word)
  49. dic_text[list_text.index(line)] = word_list_obj
  50. return dic_text
Advertisement
Add Comment
Please, Sign In to add comment