Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from nltk.corpus import stopwords
- from pymystem3 import Mystem
# Shared lookup tables and resources. The methods below read them through
# ``self`` (self.numbers, self.symbols, self.stop_words, self.mapping,
# self.mystem), so these are presumably attributes of an enclosing class
# whose header is not visible here -- TODO confirm.

# Russian words for the digits; the list index is the digit it spells out.
numbers = [
    'ноль', 'один', 'два', 'три', 'четыре',
    'пять', 'шесть', 'семь', 'восемь', 'девять',
]

# Symbols that get spelled out as Russian words in the text.
symbols = {
    '%': 'процент',
    '$': 'доллар',
    '-': 'минус',
    '+': 'плюс',
}

# NLTK's Russian stop-word list extended with a few extra words and dashes.
stop_words = {
    *stopwords.words('russian'),
    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', '–', 'к', 'на', '...',
}

# Translation table applied to the part-of-speech tag taken from the Mystem
# analysis; tags missing from the table are emitted as 'X' by tag_mystem.
# NOTE(review): the values look like Universal POS tags -- confirm.
mapping = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'ADV',
    'ANUM': 'ADJ',
    'APRO': 'DET',
    'COM': 'ADJ',
    'CONJ': 'SCONJ',
    'INTJ': 'INTJ',
    'NONLEX': 'X',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'UNKN': 'X',
    'V': 'VERB',
}

# CPU count as reported by ``mp`` (presumably the multiprocessing module --
# its import is not visible here); not used by the code in this section.
cores = mp.cpu_count()

# Single Mystem analyzer instance shared by tag_mystem and tokenize_ru.
mystem = Mystem()
def change_symbols_and_filter(self, text: str) -> str:
    """Spell out digits and symbols, then keep only Cyrillic non-stop words.

    Every digit is replaced by the matching entry of ``self.numbers`` and
    every key of ``self.symbols`` by its value, each padded with spaces.
    The result is split into runs of Cyrillic letters, words found in
    ``self.stop_words`` are dropped, and the rest is joined with single
    spaces.

    Args:
        text: Raw input text.

    Returns:
        The filtered words joined by a single space ("" if none remain).
    """
    result = text
    for digit, word in enumerate(self.numbers):
        result = result.replace(str(digit), ' ' + word + ' ')
    # Symbols are replaced as literal text, not as regex patterns: as a
    # pattern '$' matches the end of the string (appending the word to every
    # text) and '+' is an invalid pattern that previously failed silently.
    for symbol, word in self.symbols.items():
        result = result.replace(symbol, ' ' + word + ' ')
    # 'Ё' and 'ё' lie outside the А-я code-point range, so they are listed
    # explicitly; otherwise words containing them would be cut in pieces.
    words = re.findall(r'[А-яЁё]+', result)
    return ' '.join(word for word in words if word not in self.stop_words)
def tag_mystem(self, text: str) -> list:
    """Turn text into a list of ``lemma_TAG`` strings using Mystem.

    For every item returned by ``self.mystem.analyze(text)`` the first
    analysis variant is used: its ``lex`` value (stripped, lower-cased) is
    the lemma and the leading part of its ``gr`` value (before the first
    ',' or '=') is the part-of-speech tag. Items without a usable analysis
    and lemmas listed in ``self.stop_words`` are skipped. The tag is
    translated through ``self.mapping``; unknown tags become ``X``.

    Args:
        text: Text to analyze.

    Returns:
        List of strings of the form ``<lemma>_<TAG>``.
    """
    tagged = []
    for item in self.mystem.analyze(text):
        try:
            variant = item["analysis"][0]
            lemma = variant["lex"].strip().lower()
            if lemma in self.stop_words:
                continue
            grammar = variant["gr"]
        except (KeyError, IndexError):
            # No analysis for this item (or a required field is missing).
            continue
        pos = grammar.split(',')[0].split('=')[0].strip().upper()
        tagged.append(lemma + '_' + self.mapping.get(pos, 'X'))
    return tagged
def tokenize_ru(self, text):
    """Lemmatize text with Mystem and drop stop words and single spaces.

    Args:
        text: Text passed to ``self.mystem.lemmatize``.

    Returns:
        List of the lemmatizer's output items, in order, excluding items
        found in ``self.stop_words`` and items equal to a single space.
    """
    kept = []
    for lemma in self.mystem.lemmatize(text):
        if lemma in self.stop_words or lemma == " ":
            continue
        kept.append(lemma)
    return kept
def normalization(self, text: str) -> str:
    """Run the full pipeline: clean the text, tag it, join the tags.

    Args:
        text: Raw input text.

    Returns:
        The items produced by ``tag_mystem`` for the output of
        ``change_symbols_and_filter``, joined by single spaces.
    """
    cleaned = self.change_symbols_and_filter(text)
    tagged = self.tag_mystem(cleaned)
    return ' '.join(tagged)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement