Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from preprocessing import *
def create_submission(preds, filename, ids):
    """Write per-class probability predictions to a Kaggle-style CSV.

    :param preds: array of shape (n_samples, 3), columns in the order
                  EAP, HPL, MWS
    :param filename: target csv file name
    :param ids: test ids, used as the CSV index
    :return: None (writes *filename* as a side effect)
    """
    # The previous dict carried a dead, length-mismatched "id": [] entry;
    # the ids belong on the index, not in a data column.
    df = pd.DataFrame(
        {"EAP": preds[:, 0], "HPL": preds[:, 1], "MWS": preds[:, 2]},
        columns=["EAP", "HPL", "MWS"],
        index=ids,
    )
    # Name the index so the CSV header reads "id,EAP,HPL,MWS" instead of
    # starting with an empty cell.
    df.index.name = "id"
    df.to_csv(filename)
def get_additional_features(data):
    """Return a copy of *data* extended with simple text-statistic columns.

    :param data: DataFrame with a "text" column of strings
                 (assumed — TODO confirm against create_df's schema)
    :return: copy of *data* with words / num_words / num_unique_words /
             num_stopwords / num_punctuations / num_words_upper /
             num_words_title / mean_word_len columns added
    """
    new_df = data.copy()
    # Requires the NLTK stopwords corpus to be downloaded beforehand.
    eng_stopwords = set(nltk.corpus.stopwords.words("english"))
    # Whitespace tokenization; punctuation stays attached to words.
    new_df["words"] = new_df["text"].apply(str.split)
    # Word counts.
    new_df["num_words"] = new_df["words"].apply(len)
    new_df["num_unique_words"] = new_df["words"].apply(lambda words: len(set(words)))
    # Count with generators instead of materializing throwaway lists.
    new_df["num_stopwords"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w in eng_stopwords))
    new_df["num_punctuations"] = new_df["text"].apply(
        lambda text: sum(1 for c in text if c in string.punctuation))
    new_df["num_words_upper"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w.isupper()))
    new_df["num_words_title"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w.istitle()))
    # BUG FIX: np.mean([]) returns NaN and emits a RuntimeWarning for an
    # empty text; report 0.0 instead so downstream numeric code stays clean.
    new_df["mean_word_len"] = new_df["words"].apply(
        lambda words: float(np.mean([len(w) for w in words])) if words else 0.0)
    return new_df
if __name__ == "__main__":
    # Load and preprocess the training data.
    train_df = create_df("train")
    # NOTE(review): the numeric columns added here are never fed to the
    # pipeline below — only the text column is used for training.
    extended_df = get_additional_features(train_df)
    extended_df.text = extended_df.text.apply(clean_text)
    extended_df.text = extended_df.text.apply(lemmatize_text)

    # Encode author labels as integers (order matches the probability
    # columns written by create_submission).
    authors_vocab = {"EAP": 0, "HPL": 1, "MWS": 2}
    y = np.array([authors_vocab[author] for author in train_df.author])

    # Bag-of-words (uni+bi-grams) -> tf-idf -> logistic-loss SGD.
    # NOTE(review): n_iter is the pre-0.21 sklearn spelling (max_iter in
    # newer releases); left unchanged to match the pinned sklearn version.
    text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='log', penalty='l2',
                                                       alpha=1e-4, n_iter=15, random_state=33)),
                             ])

    # BUG FIX: train on the cleaned/lemmatized text. Previously X was taken
    # from the raw train_df.text, silently discarding the preprocessing above.
    X = extended_df.text.values
    print(y.shape, X.shape)
    text_clf_svm.fit(X, y)

    # Apply the SAME preprocessing to the test set as to the training set
    # (clean_text was previously commented out here, creating a train/test
    # preprocessing mismatch).
    test_df = create_df("test")
    test_df.text = test_df.text.apply(clean_text)
    test_df.text = test_df.text.apply(lemmatize_text)

    X_pred = test_df.text.values
    result = text_clf_svm.predict_proba(X_pred)
    print(result[:5])
    create_submission(result, "nb_preds.csv", test_df.id)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement