from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import nltk
import string
from preprocessing import create_df, clean_text, lemmatize_text
import numpy as np
import pandas as pd

def create_submission(preds, filename, ids):
    """Write class-probability predictions to a submission csv.

    :param preds: predicted probabilities, shape (n_samples, 3)
    :param filename: target csv file name
    :param ids: test ids, used as the index
    """
    preds_data = {"EAP": preds[:, 0],
                  "HPL": preds[:, 1],
                  "MWS": preds[:, 2]}
    df = pd.DataFrame(preds_data, columns=["EAP", "HPL", "MWS"], index=ids)
    # Label the index so the csv header reads id, EAP, HPL, MWS
    df.to_csv(filename, index_label="id")

def get_additional_features(data):
    """Add simple hand-crafted text statistics as new columns."""
    new_df = data.copy()
    eng_stopwords = set(nltk.corpus.stopwords.words("english"))

    new_df["words"] = new_df["text"].apply(lambda text: text.split())

    # Number of words
    new_df["num_words"] = new_df["words"].apply(lambda words: len(words))

    # Number of unique words
    new_df["num_unique_words"] = new_df["words"].apply(lambda words: len(set(words)))

    # Number of stopwords
    new_df["num_stopwords"] = new_df["words"].apply(
        lambda words: len([w for w in words if w in eng_stopwords]))

    # Number of punctuation characters
    new_df["num_punctuations"] = new_df["text"].apply(
        lambda text: len([c for c in text if c in string.punctuation]))

    # Number of fully upper-case words
    new_df["num_words_upper"] = new_df["words"].apply(
        lambda words: len([w for w in words if w.isupper()]))

    # Number of title-case words
    new_df["num_words_title"] = new_df["words"].apply(
        lambda words: len([w for w in words if w.istitle()]))

    # Mean word length (0.0 for empty texts, so np.mean never sees an empty list)
    new_df["mean_word_len"] = new_df["words"].apply(
        lambda words: np.mean([len(w) for w in words]) if words else 0.0)

    return new_df
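

# The statistics added by get_additional_features are computed above but never
# fed to the classifier below. A minimal sketch of one way to use them,
# assuming scipy is installed: stack the numeric columns next to a vectorized
# text matrix. combine_text_and_numeric_features is a hypothetical helper,
# not part of the original script.
def combine_text_and_numeric_features(text_matrix, df):
    from scipy.sparse import hstack
    numeric_cols = ["num_words", "num_unique_words", "num_stopwords",
                    "num_punctuations", "num_words_upper",
                    "num_words_title", "mean_word_len"]
    # hstack takes a mix of sparse and dense blocks and returns a sparse matrix
    return hstack([text_matrix, df[numeric_cols].values])
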
if __name__ == "__main__":
    train_df = create_df("train")
    extended_df = get_additional_features(train_df)
    extended_df.text = extended_df.text.apply(clean_text)
    extended_df.text = extended_df.text.apply(lemmatize_text)

    # Map author labels to integer class ids
    authors_vocab = {"EAP": 0, "HPL": 1, "MWS": 2}
    y = train_df.author.map(authors_vocab).values

    # SGDClassifier with logistic loss acts as logistic regression, which is
    # what makes predict_proba available below; older sklearn spells these
    # options loss='log' and n_iter=15
    text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='log_loss', penalty='l2',
                                                       alpha=1e-4, max_iter=15,
                                                       random_state=33)),
                             ])
    # Train on the cleaned, lemmatized text; the original read train_df.text,
    # which skips the preprocessing applied to extended_df above
    X = extended_df.text.values
    print(y.shape, X.shape)
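    # A local hold-out check, added here as a sketch and not part of the
    # original script: multiclass log loss is the natural metric for these
    # class probabilities, and the 80/20 split is an illustrative choice.
    # The final fit on the full training set happens just below.
    from sklearn.metrics import log_loss
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2,
                                                random_state=33)
    val_preds = text_clf_svm.fit(X_tr, y_tr).predict_proba(X_val)
    print("hold-out log loss:", log_loss(y_val, val_preds))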
    # Final fit on the full training set
    text_clf_svm.fit(X, y)

    test_df = create_df("test")
    # Apply the same cleaning as for the training text; the original left
    # this step commented out, mismatching train and test preprocessing
    test_df.text = test_df.text.apply(clean_text)
    test_df.text = test_df.text.apply(lemmatize_text)
    X_pred = test_df.text.values

    result = text_clf_svm.predict_proba(X_pred)
    ids_ = test_df.id
    print(result[:5])
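    # Sanity check, added here and not in the original: each row of
    # predict_proba should be a probability distribution over the 3 authors
    assert np.allclose(result.sum(axis=1), 1.0)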
    create_submission(result, "nb_preds.csv", ids_)