Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from natasha import (
- Segmenter,
- NewsEmbedding,
- NewsMorphTagger,
- NewsSyntaxParser,
- MorphVocab,
- Doc
- )
# Shared Natasha pipeline components, built once when the module is loaded
# and reused by every call that processes text.
emb = NewsEmbedding()  # single embedding instance handed to both models below
segmenter = Segmenter()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
- import re
# Runs of lowercase Russian letters (а-я, ё), ASCII letters and digits.
# Compiled once so the pattern is not looked up on every call.
_WORD_RE = re.compile(r'[а-яё0-9a-z]+')

# Lazily created MorphVocab shared by all calls (see _get_morph_vocab).
_morph_vocab = None


def _get_morph_vocab():
    """Return the shared MorphVocab, creating it on first use."""
    global _morph_vocab
    if _morph_vocab is None:
        _morph_vocab = MorphVocab()
    return _morph_vocab


def get_standart_text(text):
    """Normalize *text* to a space-separated string of lemmas.

    The text is lower-cased and reduced to runs of Russian letters, ASCII
    letters and digits (every other character acts as a separator). The
    result is segmented, morphologically tagged and lemmatized with Natasha.

    Args:
        text: Input string.

    Returns:
        The lemmas of all tokens joined by single spaces, or an empty
        string when the input contains no matching characters.
    """
    words = _WORD_RE.findall(text.lower())
    if not words:
        # Nothing to lemmatize; skip the models entirely.
        return ""

    doc = Doc(' '.join(words))
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    # Syntax parsing is intentionally not run: lemmatization only relies on
    # the morphology tags assigned above, and the parse result was never used.

    vocab = _get_morph_vocab()
    for token in doc.tokens:
        token.lemmatize(vocab)
    return " ".join(token.lemma for token in doc.tokens)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement