from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB  # unused below; see the sketch at the end
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import nltk
import string
import pandas as pd  # needed for pd.DataFrame in create_submission
from preprocessing import *  # local module providing create_df, clean_text, lemmatize_text
import numpy as np

def create_submission(preds, filename, ids):
    """
    Write class-probability predictions to a Kaggle submission file.

    :param preds: predicted probabilities, shape (n_samples, 3)
    :param filename: target csv file name
    :param ids: test ids
    :return: None
    """
    preds_data = {"EAP": preds[:, 0],
                  "HPL": preds[:, 1],
                  "MWS": preds[:, 2]}
    df = pd.DataFrame(preds_data, columns=["EAP", "HPL", "MWS"], index=ids)
    df.index.name = "id"  # ensures the CSV header is id,EAP,HPL,MWS
    df.to_csv(filename)
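
# A minimal usage sketch (illustrative names, not part of this script),
# assuming `clf` is a fitted classifier exposing predict_proba:
#
# probs = clf.predict_proba(test_texts)  # shape (n_samples, 3)
# create_submission(probs, "preds.csv", test_df.id)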


def get_additional_features(data):
    """Add simple count-based text statistics as new columns."""
    new_df = data.copy()
    # Requires the NLTK stopwords corpus: nltk.download("stopwords")
    eng_stopwords = set(nltk.corpus.stopwords.words("english"))

    new_df["words"] = new_df["text"].apply(lambda text: text.split())

    # Num words
    new_df["num_words"] = new_df["words"].apply(len)

    # Num unique words
    new_df["num_unique_words"] = new_df["words"].apply(lambda words: len(set(words)))

    # Num stopwords
    new_df["num_stopwords"] = new_df["words"].apply(lambda words: len([w for w in words if w in eng_stopwords]))

    # Num punctuation characters
    new_df["num_punctuations"] = new_df["text"].apply(lambda text: len([c for c in text if c in string.punctuation]))

    # Num fully upper-case words
    new_df["num_words_upper"] = new_df["words"].apply(lambda words: len([w for w in words if w.isupper()]))

    # Num title-case words
    new_df["num_words_title"] = new_df["words"].apply(lambda words: len([w for w in words if w.istitle()]))

    # Mean word length (NaN for empty texts)
    new_df["mean_word_len"] = new_df["words"].apply(lambda words: np.mean([len(w) for w in words]))

    return new_df
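
# A quick smoke test on a toy frame (illustrative only; the real data comes
# from preprocessing.create_df). Uncomment to inspect the new columns:
#
# _toy = pd.DataFrame({"text": ["The raven SAT Still.", "It was a dark night."]})
# print(get_additional_features(_toy)[["num_words", "num_stopwords",
#                                      "num_words_upper", "num_words_title"]])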


if __name__ == "__main__":
    train_df = create_df("train")
    extended_df = get_additional_features(train_df)
    extended_df.text = extended_df.text.apply(clean_text)
    extended_df.text = extended_df.text.apply(lemmatize_text)

    # Map author initials to integer class labels
    authors_vocab = {"EAP": 0, "HPL": 1, "MWS": 2}
    y = train_df.author.map(authors_vocab).values

    # Bag-of-words (unigrams + bigrams) -> tf-idf -> linear classifier trained
    # with SGD. loss='log' gives a logistic-regression-style model, which is
    # what makes predict_proba available below.
    # Note: n_iter was renamed max_iter in scikit-learn >= 0.21.
    text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='log', penalty='l2',
                                                       alpha=1e-4, n_iter=15,
                                                       random_state=33)),
                             ])
    X = extended_df.text.values  # train on the cleaned, lemmatized text prepared above
    print(y.shape, X.shape)
    text_clf_svm.fit(X, y)
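
    # A minimal validation sketch (not in the original script) using the
    # already-imported train_test_split; probabilistic submissions like this
    # are typically scored with multiclass log loss. Uncomment to estimate
    # held-out performance:
    #
    # from sklearn.base import clone
    # from sklearn.metrics import log_loss
    # X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2,
    #                                             random_state=33)
    # val_clf = clone(text_clf_svm).fit(X_tr, y_tr)
    # print("validation log loss:", log_loss(y_val, val_clf.predict_proba(X_val)))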

    test_df = create_df("test")
    # Apply the same preprocessing as the training text so train and test match
    test_df.text = test_df.text.apply(clean_text)
    test_df.text = test_df.text.apply(lemmatize_text)
    X_pred = test_df.text.values

    result = text_clf_svm.predict_proba(X_pred)
    ids_ = test_df.id
    print(result[:5])
    create_submission(result, "nb_preds.csv", ids_)
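
    # The unused MultinomialNB import and the "nb_preds.csv" file name suggest
    # an earlier Naive Bayes variant; a hedged sketch of what that pipeline
    # may have looked like (an assumption, not the author's confirmed code):
    #
    # text_clf_nb = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
    #                         ('tfidf', TfidfTransformer()),
    #                         ('clf-nb', MultinomialNB())])
    # text_clf_nb.fit(X, y)
    # create_submission(text_clf_nb.predict_proba(X_pred), "nb_preds.csv", ids_)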