# Untitled

#!/usr/bin/env python3

import gzip
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB


def get_path(filename):
    """Build the path to *filename* inside the sibling ``src`` directory.

    The location is resolved relative to the directory part of
    ``sys.argv[0]`` (the path the script was invoked with). The result is
    not normalised, so it still contains the literal ``..`` component.
    """
    import os
    import sys

    script_dir = os.path.dirname(sys.argv[0])
    components = (script_dir, "..", "src", filename)
    return os.path.join(*components)

def _read_lines(filename, fraction):
    """Return the leading *fraction* of the raw lines of a gzipped data file."""
    with gzip.open(get_path(filename), 'rb') as f:
        lines = f.readlines()
    return lines[:int(len(lines) * fraction)]


def spam_detection(random_state=0, fraction=1.0):
    """Train a multinomial naive Bayes spam classifier and evaluate it.

    The first *fraction* of the lines of ``spam.txt.gz`` and ``ham.txt.gz``
    (one message per line) are vectorised into word counts, split 75/25 into
    training and test sets, and used to fit and score a ``MultinomialNB``.

    Parameters:
        random_state: Seed forwarded to ``train_test_split``.
        fraction: Share (0..1) of each file's lines to use, taken from the
            start of the file.

    Returns:
        A tuple ``(accuracy, test_size, misclassified)``: the accuracy score
        on the test set, the number of test samples, and the exact integer
        number of test samples that were predicted incorrectly.
    """
    spam = _read_lines('spam.txt.gz', fraction)
    ham = _read_lines('ham.txt.gz', fraction)

    # Class labels: 0 marks spam, 1 marks ham, in the same order as the
    # concatenated messages below.
    y = np.concatenate([np.repeat(0, len(spam)), np.repeat(1, len(ham))])
    # The lines are raw bytes; CountVectorizer decodes them itself
    # (UTF-8 by default), so no manual decoding is needed here.
    X = spam + ham

    vec = CountVectorizer()
    features = vec.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, train_size=0.75, random_state=random_state)

    model = MultinomialNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Count mismatches directly rather than deriving the number from the
    # accuracy: len(y_test) * (1 - accuracy) is a float that can fall just
    # below the true integer and is then truncated by integer formatting.
    misclassified = int(np.count_nonzero(y_pred != y_test))

    return accuracy, len(y_test), misclassified

def main():
    """Run the spam detector on 10% of the data and print a short summary."""
    accuracy, total, misclassified = spam_detection(0, 0.1)
    print("Accuracy score:", accuracy)
    # Round before formatting: if the count arrives as a float it may sit a
    # hair below the true integer, and %i would truncate it to one too few.
    print("%i messages misclassified out of %i"
          % (int(round(misclassified)), total))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()