Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from preprocessing import *
def create_submission(preds, filename, ids):
    """Write per-class probability predictions to a Kaggle-style CSV.

    :param preds: array of shape (n_samples, 3), columns in the order
                  EAP, HPL, MWS
    :param filename: target csv file name
    :param ids: test ids, used as the CSV index
    :return: None (writes *filename* as a side effect)
    """
    # The previous dict carried a dead, length-mismatched "id": [] entry;
    # the ids belong on the index, not in a data column.
    df = pd.DataFrame(
        {"EAP": preds[:, 0], "HPL": preds[:, 1], "MWS": preds[:, 2]},
        columns=["EAP", "HPL", "MWS"],
        index=ids,
    )
    # Name the index so the CSV header reads "id,EAP,HPL,MWS" instead of
    # starting with an empty cell.
    df.index.name = "id"
    df.to_csv(filename)
def get_additional_features(data):
    """Return a copy of *data* extended with simple text-statistic columns.

    :param data: DataFrame with a "text" column of strings
                 (assumed — TODO confirm against create_df's schema)
    :return: copy of *data* with words / num_words / num_unique_words /
             num_stopwords / num_punctuations / num_words_upper /
             num_words_title / mean_word_len columns added
    """
    new_df = data.copy()
    # Requires the NLTK stopwords corpus to be downloaded beforehand.
    eng_stopwords = set(nltk.corpus.stopwords.words("english"))
    # Whitespace tokenization; punctuation stays attached to words.
    new_df["words"] = new_df["text"].apply(str.split)
    # Word counts.
    new_df["num_words"] = new_df["words"].apply(len)
    new_df["num_unique_words"] = new_df["words"].apply(lambda words: len(set(words)))
    # Count with generators instead of materializing throwaway lists.
    new_df["num_stopwords"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w in eng_stopwords))
    new_df["num_punctuations"] = new_df["text"].apply(
        lambda text: sum(1 for c in text if c in string.punctuation))
    new_df["num_words_upper"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w.isupper()))
    new_df["num_words_title"] = new_df["words"].apply(
        lambda words: sum(1 for w in words if w.istitle()))
    # BUG FIX: np.mean([]) returns NaN and emits a RuntimeWarning for an
    # empty text; report 0.0 instead so downstream numeric code stays clean.
    new_df["mean_word_len"] = new_df["words"].apply(
        lambda words: float(np.mean([len(w) for w in words])) if words else 0.0)
    return new_df
if __name__ == "__main__":
    # Load and preprocess the training data.
    train_df = create_df("train")
    # NOTE(review): the numeric columns added here are never fed to the
    # pipeline below — only the text column is used for training.
    extended_df = get_additional_features(train_df)
    extended_df.text = extended_df.text.apply(clean_text)
    extended_df.text = extended_df.text.apply(lemmatize_text)

    # Encode author labels as integers (order matches the probability
    # columns written by create_submission).
    authors_vocab = {"EAP": 0, "HPL": 1, "MWS": 2}
    y = np.array([authors_vocab[author] for author in train_df.author])

    # Bag-of-words (uni+bi-grams) -> tf-idf -> logistic-loss SGD.
    # NOTE(review): n_iter is the pre-0.21 sklearn spelling (max_iter in
    # newer releases); left unchanged to match the pinned sklearn version.
    text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='log', penalty='l2',
                                                       alpha=1e-4, n_iter=15, random_state=33)),
                             ])

    # BUG FIX: train on the cleaned/lemmatized text. Previously X was taken
    # from the raw train_df.text, silently discarding the preprocessing above.
    X = extended_df.text.values
    print(y.shape, X.shape)
    text_clf_svm.fit(X, y)

    # Apply the SAME preprocessing to the test set as to the training set
    # (clean_text was previously commented out here, creating a train/test
    # preprocessing mismatch).
    test_df = create_df("test")
    test_df.text = test_df.text.apply(clean_text)
    test_df.text = test_df.text.apply(lemmatize_text)

    X_pred = test_df.text.values
    result = text_clf_svm.predict_proba(X_pred)
    print(result[:5])
    create_submission(result, "nb_preds.csv", test_df.id)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement