Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
- import nltk
- import pandas as pd
- import numpy as np
- from bs4 import BeautifulSoup
- from nltk.corpus import stopwords
class KaggleWord2VecUtility(object):
    """Utility class for turning raw HTML review text into word lists /
    sentence lists for downstream learning (e.g. word2vec)."""

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=True):
        """Convert one document to a list of lower-cased words.

        Steps: strip HTML, drop every character that is not a Cyrillic
        letter (the corpus is Russian), lower-case, split on whitespace,
        and optionally remove stop words (enabled by default).
        """
        # 1. Remove HTML markup.
        review_text = BeautifulSoup(review).get_text()
        # 2. Keep Cyrillic letters only.
        review_text = re.sub("[^а-яА-Я]", " ", review_text)
        # 3. Lower-case and tokenize.
        words = review_text.lower().split()
        # 4. Optionally remove stop words.
        if remove_stopwords:
            # BUG FIX: the original tested membership against the raw
            # DataFrame returned by read_csv (``w in df`` checks *column
            # names*, so effectively nothing was filtered) and never used
            # the NLTK stop-word set it had just built.  Merge both
            # sources into a single set and filter against that.
            stops = set(stopwords.words("russian"))
            stops.update(pd.read_csv("stopwords_ru.csv").iloc[:, 0].astype(str))
            words = [w for w in words if w not in stops]
        # 5. Return the list of words.
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into sentences; each sentence is a word list.

        ``tokenizer`` is an NLTK sentence tokenizer (e.g. punkt).
        NOTE(review): ``review.decode('utf8')`` implies Python 2 /
        byte-string input -- confirm before running under Python 3,
        where ``str`` has no ``decode``.
        """
        # 1. Use the NLTK tokenizer to split the paragraph into sentences.
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        # 2. Clean each non-empty sentence into a list of words.
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(
                        raw_sentence, remove_stopwords))
        # Returns a list of lists of words.
        return sentences
- import os
- import pandas as pd
- import numpy as np
- from sklearn.cross_validation import train_test_split
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.linear_model import LogisticRegressionCV as LR
- from sklearn.svm import SVC
- from sklearn.metrics import roc_auc_score as AUC
# ---- Load and prepare the training data ----------------------------------
data = pd.read_csv("app_review_rating_train.csv")  # raw training file
# Keep only year-month of the date (drop the day component).
data['Date'] = data.Date.str[:7]
# One-hot encode the categorical columns, replacing each original column
# with prefixed dummy columns (e.g. "AppName#...").
categorical_columns = ["Date", "AppName", "Language", "Version"]
for cc in categorical_columns:
    dummies = pd.get_dummies(data[cc], drop_first=False)
    dummies = dummies.add_prefix("{}#".format(cc))
    data.drop(cc, axis=1, inplace=True)
    data = data.join(dummies)
# Merge Review into Title so all free text lives in one 'Text' column.
data.Title = data.Title.fillna(" ", axis=0)
data.Review = data.Review.fillna(" ", axis=0)
data['Title'] = data['Title'] + " " + data.Review.str[:]
data = data.drop("Review", axis=1)
data = data.rename(columns={'Title': 'Text'})
# 80/20 split on row positions, fixed seed for reproducibility.
train_i, test_i = train_test_split(np.arange(len(data)), train_size=0.8, random_state=44)
# FIX: ``.ix`` is deprecated and removed in modern pandas; the indices
# produced above are positional, so ``.iloc`` is the correct accessor.
train = data.iloc[train_i]
test = data.iloc[test_i]
def _clean_reviews(texts):
    """Return cleaned review strings for every non-null entry in *texts*.

    Null entries are skipped; a running count of skipped rows is printed
    (same diagnostic behaviour as the original inline loops).
    """
    cleaned = []
    skipped = 0
    for review in texts:
        if pd.isnull(review):
            skipped += 1
            print(skipped)
        else:
            cleaned.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))
    return cleaned

# FIX: the two original copy-pasted train/test loops collapsed into one
# helper (DRY); module-level result names are unchanged.
print("Parsing train reviews...")
clean_train_reviews = _clean_reviews(train['Text'])
print("Parsing test reviews...")
clean_test_reviews = _clean_reviews(test['Text'])
#
print("Vectorizing...")
# TF-IDF over unigrams..trigrams, capped at 40k features, with
# sublinear (1 + log) term-frequency scaling.
vectorizer = TfidfVectorizer(
    max_features=40000,
    ngram_range=(1, 3),
    sublinear_tf=True,
)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)
# Helper: fit *model* and score it with ROC AUC on the held-out set.
def train_and_eval_auc(model, train_x, train_y, test_x, test_y):
    """Fit *model* on the training data and return its ROC AUC on the test set.

    Scores with the probability of the second class (``p[:, 1]``), which
    assumes a binary target.  NOTE(review): ``Rating`` looks multi-valued
    upstream -- confirm roc_auc_score accepts it, otherwise this raises.
    """
    model.fit(train_x, train_y)
    p = model.predict_proba(test_x)
    # FIX: removed the stray debug print of the full probability matrix.
    return AUC(test_y, p[:, 1])
#
# NOTE(review): ``lr`` is constructed but never evaluated -- only the SVC
# below is scored.  Kept so the module-level name still exists.
lr = LR(multi_class="multinomial")
clf = SVC(probability=True)
auc = train_and_eval_auc(clf, train_data_features, train["Rating"],
                         test_data_features, test["Rating"].values)
# FIX: the original label said "logistic regression AUC" but the model
# actually evaluated is the SVC (``clf``).
print("SVC AUC:", auc)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement