# Untitled

import multiprocessing as mp
import re

import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem

# Russian words for the digits; the list index is the digit being spelled out.
numbers = [
    'ноль', 'один', 'два', 'три', 'четыре',
    'пять', 'шесть', 'семь', 'восемь', 'девять',
]

# Symbols and the Russian word substituted for each (insertion order is kept).
symbols = {
    '%': 'процент',
    '$': 'доллар',
    '-': 'минус',
    '+': 'плюс',
}

# NLTK's Russian stop-word list plus a few extra words, dashes and the ellipsis.
stop_words = set(stopwords.words('russian'))
stop_words.update(
    ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', '–', 'к', 'на', '...']
)

# Part-of-speech tags looked up by tag_mystem -> coarse tag appended to the lemma
# (the values look like Universal POS tags; tags missing here fall back to 'X').
mapping = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'ADV',
    'ANUM': 'ADJ',
    'APRO': 'DET',
    'COM': 'ADJ',
    'CONJ': 'SCONJ',
    'INTJ': 'INTJ',
    'NONLEX': 'X',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'UNKN': 'X',
    'V': 'VERB',
}

# CPU count as reported by mp.cpu_count().
cores = mp.cpu_count()

# Module-level Mystem analyzer instance.
mystem = Mystem()

def change_symbols_and_filter(self, text: str) -> str:
    """Spell out digits and known symbols, then keep only Cyrillic non-stop words.

    Every digit character is replaced by the word stored at that index in
    ``self.numbers`` and every key of ``self.symbols`` by its mapped word,
    each padded with spaces. Runs of Cyrillic letters are then extracted,
    words present in ``self.stop_words`` are dropped (the comparison is
    case-sensitive), and the remainder is joined with single spaces.

    Args:
        text: Raw input text.

    Returns:
        The surviving words separated by single spaces; an empty string
        when nothing survives.
    """
    result = text
    for digit, word in enumerate(self.numbers):
        result = result.replace(str(digit), f' {word} ')

    # Symbols are replaced literally. Treating them as regex patterns is wrong:
    # '$' matches end-of-text (appending its word to every input) and '+'
    # is not a valid pattern on its own, so it was never replaced.
    # NOTE: '-' is replaced wherever it occurs, including inside hyphenated words.
    for symbol, word in self.symbols.items():
        result = result.replace(symbol, f' {word} ')

    # 'Ё'/'ё' lie outside the А-я code-point range and must be listed explicitly,
    # otherwise words containing them are split at that letter.
    words = re.findall(r'[А-Яа-яЁё]+', result)
    return ' '.join(word for word in words if word not in self.stop_words)

def tag_mystem(self, text: str) -> list:
    """Turn *text* into a list of ``lemma_TAG`` strings.

    ``self.mystem.analyze`` is run on the text and, for every returned item,
    only the first entry of its ``"analysis"`` list is used. The lemma
    (``"lex"``) is stripped and lower-cased; lemmas found in
    ``self.stop_words`` are skipped. The tag is the part of ``"gr"`` before
    the first ``','`` and ``'='``, upper-cased and translated through
    ``self.mapping`` (``'X'`` when the tag is not in the mapping).

    Items without an ``"analysis"`` key, with an empty analysis list, or
    missing ``"lex"``/``"gr"`` are silently skipped.

    Args:
        text: Text to analyse.

    Returns:
        The ``lemma_TAG`` strings in the order the items were returned.
    """
    result = []
    for item in self.mystem.analyze(text):
        try:
            first = item["analysis"][0]
            lemma = first["lex"].strip().lower()
            if lemma in self.stop_words:
                continue
            grammar = first["gr"]
        except (KeyError, IndexError):
            # No usable analysis for this item.
            continue
        head = grammar.split(',')[0]
        tag = head.split('=')[0].strip().upper()
        result.append(f"{lemma}_{self.mapping.get(tag, 'X')}")
    return result

def tokenize_ru(self, text):
    """Lemmatize *text* and drop stop words and single-space tokens.

    Args:
        text: Text passed to ``self.mystem.lemmatize``.

    Returns:
        The lemmas, in order, that are neither in ``self.stop_words`` nor
        equal to a single space.
    """
    kept = []
    for lemma in self.mystem.lemmatize(text):
        if lemma in self.stop_words or lemma == " ":
            continue
        kept.append(lemma)
    return kept

def normalization(self, text: str) -> str:
    """Run the full pipeline on *text* and return the tags as one string.

    The text is first passed through ``self.change_symbols_and_filter``,
    the result is tagged with ``self.tag_mystem``, and the resulting items
    are joined with single spaces.

    Args:
        text: Raw input text.

    Returns:
        The tagged items separated by single spaces.
    """
    cleaned = self.change_symbols_and_filter(text)
    tagged = self.tag_mystem(cleaned)
    return ' '.join(tagged)