Advertisement
Guest User

Untitled

a guest
Dec 7th, 2019
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.26 KB | None | 0 0
import multiprocessing as mp
import re

import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
  4.  
  5. numbers = ['ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять']
  6. symbols = {'%': 'процент', '$': 'доллар', '-': 'минус', '+': 'плюс'}
  7. stop_words = set(
  8. stopwords.words('russian') + ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', '–', 'к', 'на', '...'])
  9. mapping = {'A': 'ADJ', 'ADV': 'ADV', 'ADVPRO': 'ADV', 'ANUM': 'ADJ', 'APRO': 'DET', 'COM': 'ADJ', 'CONJ': 'SCONJ',
  10.                'INTJ': 'INTJ', 'NONLEX': 'X', 'NUM': 'NUM', 'PART': 'PART', 'PR': 'ADP', 'S': 'NOUN', 'SPRO': 'PRON',
  11.                'UNKN': 'X', 'V': 'VERB'}
  12. cores = mp.cpu_count()
  13. mystem = Mystem()
  14.  
  15. def change_symbols_and_filter(self, text: str) -> str:
  16.        result = text
  17.        for i in range(len(self.numbers)):
  18.            result = re.sub(str(i), ' ' + self.numbers[i] + ' ', result)
  19.        for key in self.symbols.keys():
  20.            try:
  21.                result = re.sub(key, ' ' + self.symbols[key] + ' ', result)
  22.            except:
  23.                pass
  24.        result = list(filter(lambda word: word not in self.stop_words, re.findall(r'[А-я]+', result)))
  25.        return ' '.join(result)
  26.  
  27. def tag_mystem(self, text: str) -> list:
  28.         processed = self.mystem.analyze(text)
  29.         tagged = []
  30.         for w in processed:
  31.             try:
  32.                 lemma = w["analysis"][0]["lex"].strip().lower()
  33.                 if lemma not in self.stop_words:
  34.                     pos = w["analysis"][0]["gr"].split(',')[0]
  35.                     pos = pos.split('=')[0].strip().upper()
  36.                     if pos in self.mapping:
  37.                         tagged.append(lemma + '_' + self.mapping[pos])
  38.                     else:
  39.                         tagged.append(lemma + '_X')
  40.             except KeyError:
  41.                 continue
  42.             except IndexError:
  43.                 continue
  44.         return tagged
  45.  
  46. def tokenize_ru(self, text):
  47.     tokens = [token for token in self.mystem.lemmatize(text) if token not in self.stop_words and token != " "]
  48.     return tokens
  49.  
  50. def normalization(self, text: str) -> str:
  51.         return ' '.join(self.tag_mystem(self.change_symbols_and_filter(text)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement