Advertisement
PoulYakov

Untitled

Oct 1st, 2022
727
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.66 KB | None | 0 0
  1. from natasha import (
  2.     Segmenter,
  3.    
  4.     NewsEmbedding,
  5.     NewsMorphTagger,
  6.     NewsSyntaxParser,
  7.     MorphVocab,
  8.     Doc
  9. )
  10. segmenter = Segmenter()
  11. emb = NewsEmbedding()
  12. morph_tagger = NewsMorphTagger(emb)
  13. syntax_parser = NewsSyntaxParser(emb)
  14.  
  15. import re
  16.  
  17.  
  18.  
  19. def get_standart_text(text):
  20.     text = text.lower()
  21.     text = ' '.join(re.findall('[а-яё0-9a-z]+', text))
  22.     doc = Doc(text)
  23.     doc.segment(segmenter)
  24.     doc.tag_morph(morph_tagger)
  25.     doc.parse_syntax(syntax_parser)
  26.  
  27.     morph_vocab = MorphVocab()
  28.     for token in doc.tokens:
  29.         token.lemmatize(morph_vocab)
  30.     d = [i.lemma for i in doc.tokens]
  31.  
  32.     return " ".join(d)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement