Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import nltk
class Word:
    """A single annotated token of a sentence.

    Instances bundle the surface form of a token with its lemma, its
    part-of-speech tag and its position inside the sentence. The values
    are stored in name-mangled attributes and read through the getters.
    """

    def __init__(self, posi, token, stem, pos):
        """Store the annotation values of one token.

        Args:
            posi: Position of the token within its sentence.
            token: The token text as it appears in the sentence.
            stem: The base form (lemma) computed for the token.
            pos: The part-of-speech tag assigned to the token.
        """
        self.__token = token
        self.__stem = stem
        self.__pos = pos
        self.__posi = posi

    def __repr__(self):
        """Return a debug representation that shows all four stored values."""
        return (
            f"{type(self).__name__}(posi={self.__posi!r}, "
            f"token={self.__token!r}, stem={self.__stem!r}, "
            f"pos={self.__pos!r})"
        )

    def get_token(self):
        """Return the token text."""
        return self.__token

    def get_stem(self):
        """Return the base form (lemma) of the token."""
        return self.__stem

    def get_pos(self):
        """Return the part-of-speech tag of the token."""
        return self.__pos

    def get_posi(self):
        """Return the position of the token within its sentence."""
        return self.__posi
def _wordnet_pos(tag):
    """Map a POS tag from nltk.pos_tag to the WordNet POS letter for lemmatizing."""
    if tag == 'CD' or tag.startswith('N'):
        return 'n'
    if tag == 'MD' or tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    # Every other tag is lemmatized as a noun.
    return 'n'


def annotate_corpus_with_nltk_pipeline(input_filename, language):
    """Read a text file and return it annotated as a dict of Word instances.

    The text is split into sentences; each sentence is tokenized and
    POS-tagged with NLTK, and every token is lemmatized with the WordNet
    lemmatizer using a POS letter derived from its tag.

    Args:
        input_filename: Path of the UTF-8 encoded text file to read.
        language: Language name handed to nltk.sent_tokenize and
            nltk.word_tokenize. Its first three characters are passed as
            ``lang`` to nltk.pos_tag.
            NOTE(review): which values the tagger accepts is not visible
            here - confirm against the NLTK documentation.

    Returns:
        A dict mapping each zero-based sentence index to the list of Word
        instances of that sentence, in token order.
    """
    with open(input_filename, 'r', encoding='utf8') as f:
        read_text = f.read()

    # lemmatize() is an instance method; create one lemmatizer up front
    # instead of calling it unbound with self=None for every token.
    lemmatizer = nltk.stem.WordNetLemmatizer()

    dic_text = {}
    sentences = nltk.sent_tokenize(read_text, language=language)
    # enumerate() gives every sentence its own index; list.index() would
    # return the first occurrence and collapse repeated sentences.
    for sentence_index, sentence in enumerate(sentences):
        tokens = nltk.word_tokenize(sentence, language=language)
        tagged_tokens = nltk.pos_tag(tokens, tagset=None, lang=language[0:3])
        dic_text[sentence_index] = [
            Word(
                posi,
                token,
                lemmatizer.lemmatize(token, pos=_wordnet_pos(tag)),
                tag,
            )
            for posi, (token, tag) in enumerate(tagged_tokens)
        ]
    return dic_text
Advertisement
Add Comment
Please, Sign In to add comment