Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pymorphy2
- import re
- from nltk.stem.snowball import SnowballStemmer
- from string import ascii_lowercase, punctuation
# Shared, module-level text-processing objects used by the functions below.
# Snowball stemmer for Russian; ignore_stopwords=True is passed through to NLTK
# (presumably leaves stopwords unstemmed -- confirm against the NLTK docs).
- stemmer = SnowballStemmer("russian", ignore_stopwords=True)
# pymorphy2 morphological analyzer, created once and reused for every call.
- morph = pymorphy2.MorphAnalyzer()
# Token pattern: one or more word characters, apostrophes, or hyphens.
- retoken = re.compile(r'[\'\w\-]+')
def stemmatize(text):
    """Stem each whitespace-separated word of *text*.

    The input is split on runs of whitespace, every piece is passed to the
    module-level ``stemmer.stem``, and the resulting stems are joined back
    together with single spaces.

    Args:
        text: The string to process.

    Returns:
        A single string of stems separated by one space each; an empty
        string when *text* contains no words.
    """
    words = text.split()
    stems = map(stemmer.stem, words)
    return ' '.join(stems)
def tokenize_normalize(text):
    """Lower-case *text*, tokenize it, and replace each token by its normal form.

    Tokens are the matches of the module-level ``retoken`` pattern in the
    lower-cased input. For every token the first result of ``morph.parse``
    is taken and its ``normal_form`` is kept.

    Args:
        text: The string to process.

    Returns:
        The normal forms joined by single spaces; an empty string when no
        token is found.
    """
    lemmas = []
    for token in retoken.findall(text.lower()):
        # Only the first parse returned by the analyzer is used.
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return ' '.join(lemmas)
Add Comment
Please, Sign In to add comment