Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import pandas as pd
- import numpy as np
- import nltk
- import pymorphy2
- from nltk.corpus import stopwords
def Normalize(text, stopwords, morph):
    """Lemmatize Russian *text*, dropping stop words and non-alphanumeric tokens.

    Args:
        text: The raw document, either UTF-8 encoded bytes or an already
            decoded text string.
        stopwords: Container of lowercase words to discard. Membership is
            tested against the lowercased token *before* lemmatization, so
            only surface forms present in the container are removed.
        morph: Analyzer exposing ``parse(word)``; the ``normal_form`` of the
            first parse result is taken as the lemma (this script passes a
            ``pymorphy2.MorphAnalyzer``).

    Returns:
        The lemmas of the surviving tokens joined by single spaces.
    """
    # Decode only genuine byte strings. Calling ``.decode`` on text that is
    # already decoded fails: Python 2 implicitly encodes it as ASCII first
    # (UnicodeEncodeError on Cyrillic), and Python 3 ``str`` has no
    # ``decode`` method at all.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    lemmas = [
        morph.parse(word)[0].normal_form
        for word in nltk.word_tokenize(text=text, language='russian')
        # Skip stop words and tokens containing punctuation or symbols.
        if word.lower() not in stopwords and word.isalnum()
    ]
    return ' '.join(lemmas)
# Load the raw news dump and discard the columns that are not used below.
data = pd.read_csv('./lenta.csv')
data.drop(['url', 'topic/rubric', 'edition', 'topics'], axis=1, inplace=True)
data.rename(columns={'1914-lenta/': 'date_of_news'}, inplace=True)

# Remove rows whose text is the literal string 'text'
# (presumably repeated header lines inside the CSV -- TODO confirm).
data.drop(index=data[data.text == 'text'].index, inplace=True)
# Remove rows whose text is not a ``str`` instance (e.g. missing values),
# because Normalize() needs string input.
data.drop(index=data[data.text.transform(type) != str].index, inplace=True)

morph = pymorphy2.MorphAnalyzer()

# NOTE: this rebinds the name ``stopwords`` from the imported nltk corpus
# reader to a plain set; the corpus reader is not reachable under this name
# afterwards.
stopwords = set(stopwords.words('russian') + [
    u'тот', u'те', u'та', u'то', u'этот', u'эти', u'эта', u'это',
    u'он', u'она', u'оно', u'наш', u'ваш', u'быть', u'мы', u'ты', u'вы',
    u'там', u'свой', u'своя', u'своё', u'ещё',
])

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare ``print 'x'`` statement is a SyntaxError on Python 3.
print('Success start')
# From here on ``data`` is the Series of normalized texts, not the DataFrame.
data = data.text.apply(lambda x: Normalize(x, stopwords, morph))
print('Saving...')
data.to_csv('clear_data.csv')
print('Success end')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement