Advertisement
Guest User

Untitled

a guest
Nov 21st, 2017
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.06 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import re
  4. import nltk
  5.  
  6. import pandas as pd
  7. import numpy as np
  8.  
  9. from bs4 import BeautifulSoup
  10. from nltk.corpus import stopwords
  11.  
  12.  
  13. class KaggleWord2VecUtility(object):
  14. """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""
  15.  
  16. @staticmethod
  17. def review_to_wordlist( review, remove_stopwords=True ):
  18. # Function to convert a document to a sequence of words,
  19. # optionally removing stop words. Returns a list of words.
  20. #
  21. # 1. Remove HTML
  22. review_text = BeautifulSoup(review).get_text()
  23. #
  24. # 2. Remove non-letters
  25. review_text = re.sub("[^а-яА-Я]"," ", review_text)
  26. #
  27. # 3. Convert words to lower case and split them
  28. words = review_text.lower().split()
  29. #
  30. # 4. Optionally remove stop words (false by default)
  31. if remove_stopwords:
  32. stops = set(stopwords.words("russian"))
  33. stopwords1 = pd.read_csv("stopwords_ru.csv")
  34. words = [w for w in words if not w in stopwords1]#ДОДЕЛАТЬ СТОП СЛОВА и ДРУГУЮ ПРЕДОБРАБОТКУ
  35. #
  36. # 5. Return a list of words
  37. return(words)
  38.  
  39. # Define a function to split a review into parsed sentences
  40. @staticmethod
  41. def review_to_sentences( review, tokenizer, remove_stopwords=False ):
  42. # Function to split a review into parsed sentences. Returns a
  43. # list of sentences, where each sentence is a list of words
  44. #
  45. # 1. Use the NLTK tokenizer to split the paragraph into sentences
  46. raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
  47. #
  48. # 2. Loop over each sentence
  49. sentences = []
  50. for raw_sentence in raw_sentences:
  51. # If a sentence is empty, skip it
  52. if len(raw_sentence) > 0:
  53. # Otherwise, call review_to_wordlist to get a list of words
  54. sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
  55. remove_stopwords ))
  56. #
  57. # Return the list of sentences (each sentence is a list of words,
  58. # so this returns a list of lists
  59. return sentences
  60.  
  61.  
  62.  
  63.  
  64. import os
  65. import pandas as pd
  66. import numpy as np
  67.  
  68. from sklearn.cross_validation import train_test_split
  69. from sklearn.feature_extraction.text import TfidfVectorizer
  70. from sklearn.linear_model import LogisticRegressionCV as LR
  71. from sklearn.svm import SVC
  72. from sklearn.metrics import roc_auc_score as AUC
  73.  
  74. data = pd.read_csv("app_review_rating_train.csv") #Читаем
  75.  
  76. data['Date'] = data.Date.str[:7]#Удалили дни
  77.  
  78. categorical_columns = ["Date","AppName","Language","Version"]#Бинаризация
  79. for cc in categorical_columns:
  80. dummies = pd.get_dummies(data[cc], drop_first=False)
  81. dummies = dummies.add_prefix("{}#".format(cc))
  82. data.drop(cc, axis=1, inplace=True)
  83. data = data.join(dummies)
  84.  
  85. data.Title = data.Title.fillna(" ", axis=0)
  86. data.Review = data.Review.fillna(" ", axis=0)
  87.  
  88. data['Title'] = data['Title'] + " " + data.Review.str[:]#Слияние Rewiew -> Title
  89. data = data.drop(("Review"), axis=1)#Удалили Review
  90. data = data.rename(columns={'Title': 'Text'})#переименовали Title -> Text
  91. #print(data.Text[38])
  92. #data = data.dropna(axis=0)
  93. #data.count(axis=0)
  94.  
  95. train_i, test_i = train_test_split( np.arange( len( data)), train_size = 0.8, random_state = 44 )
  96.  
  97. train = data.ix[train_i]
  98. test = data.ix[test_i]
  99.  
  100.  
  101. print("Parsing train reviews...")
  102.  
  103. clean_train_reviews = []
  104. i = 0
  105. for review in train['Text']:
  106. #print(review)
  107. #print("====================================================/n")
  108. if(pd.isnull(review)):
  109. i = i + 1
  110. print(i)
  111. else:
  112. clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
  113.  
  114. print("Parsing test reviews...")
  115. i = 0
  116.  
  117. clean_test_reviews = []
  118. for review in test['Text']:
  119. if(pd.isnull(review)):
  120. i = i + 1
  121. print(i)
  122. else:
  123. clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
  124.  
  125. #
  126.  
  127. print("Vectorizing...")
  128.  
  129. vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1, 3 ),
  130. sublinear_tf = True )
  131.  
  132. train_data_features = vectorizer.fit_transform( clean_train_reviews )
  133. test_data_features = vectorizer.transform( clean_test_reviews )
  134.  
  135. # let's define a helper function
  136.  
  137. def train_and_eval_auc( model, train_x, train_y, test_x, test_y ):
  138. model.fit( train_x, train_y )
  139. p = model.predict_proba( test_x )
  140. print(p)
  141. auc = AUC( test_y, p[:,1] )
  142. return auc
  143.  
  144. #
  145.  
  146. lr = LR(multi_class = "multinomial")
  147. clf = SVC(probability = True)
  148. #sklearn.linear_model.LogisticRegression (setting multi_class=”multinomial”)
  149. #sklearn.linear_model.LogisticRegressionCV (setting multi_class=”multinomial”)
  150. auc = train_and_eval_auc( clf, train_data_features, train["Rating"], \
  151. test_data_features, test["Rating"].values )
  152. print("logistic regression AUC:", auc)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement