Advertisement
Guest User

Untitled

a guest
Jul 20th, 2019
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.16 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import pandas as pd
  3. import numpy as np
  4. import nltk
  5. import pymorphy2
  6. from nltk.corpus import stopwords
  7.  
  8. def Normalize(text, stopwords, morph):
  9.     text = [morph.parse(word)[0].normal_form for word in nltk.word_tokenize(language='russian',text=text.decode('utf8')) if word.lower() not in stopwords and word.isalnum()]
  10.     return ' '.join(text)
  11.  
  12. data = pd.read_csv('./lenta.csv')
  13. data.drop(['url', 'topic/rubric', 'edition', 'topics'], axis=1, inplace=True)
  14. data.rename(columns={'1914-lenta/': 'date_of_news'}, inplace=True)
  15. data.drop(index = data[data.text == 'text'].index, inplace = True)
  16. data.drop(index = data[data.text.transform(type) != str].index, inplace = True)
  17.  
  18. morph = pymorphy2.MorphAnalyzer()
  19. stopwords = set(stopwords.words('russian') + [u'тот', u'те', u'та', u'то', u'этот', u'эти', u'эта', u'это',u'он', u'она', u'оно', u'наш', u'ваш', u'быть', u'мы', u'ты', u'вы', u'там', u'свой', u'своя', u'своё', u'ещё'])
  20.  
  21. print 'Success start'
  22. data = data.text.apply(lambda x: Normalize(x, stopwords, morph))
  23. print 'Saving...'
  24. data.to_csv('clear_data.csv')
  25. print 'Success end'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement