Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import gzip
- import numpy as np
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- from sklearn.naive_bayes import MultinomialNB
def get_path(filename):
    """Return the path of *filename* inside the sibling ``src`` directory.

    The path is built relative to the directory of the running script
    (``sys.argv[0]``): ``<script dir>/../src/<filename>``. It is not
    normalised or checked for existence.
    """
    import os
    import sys

    script_dir = os.path.dirname(sys.argv[0])
    return os.path.join(script_dir, "..", "src", filename)
def _read_leading_lines(filename, fraction):
    """Return the first *fraction* of the lines of a gzipped data file.

    The file is located with ``get_path`` and read in binary mode, so the
    result is a NumPy array of ``bytes`` lines.
    """
    with gzip.open(get_path(filename), "rb") as f:
        lines = f.readlines()
    return np.array(lines[:int(len(lines) * fraction)])


def spam_detection(random_state=0, fraction=1.0):
    """Train a naive Bayes spam classifier and report how well it does.

    The leading *fraction* of the lines in ``spam.txt.gz`` and
    ``ham.txt.gz`` are turned into word-count features, split 75/25 into
    training and test sets, and used to fit and evaluate a
    ``MultinomialNB`` model.

    Args:
        random_state: Seed passed to ``train_test_split``.
        fraction: Share (0.0-1.0) of each file's lines to use, taken from
            the start of the file.

    Returns:
        A tuple ``(accuracy, total, misclassified)``: the accuracy score on
        the test set, the number of test messages, and how many of them
        were classified incorrectly (an ``int``).
    """
    spam = _read_leading_lines("spam.txt.gz", fraction)
    ham = _read_leading_lines("ham.txt.gz", fraction)

    # Class labels: 0 marks a spam line, 1 marks a ham line.
    spam_labels = np.repeat(0, spam.shape[0])
    ham_labels = np.repeat(1, ham.shape[0])
    y = np.concatenate([spam_labels, ham_labels])
    X = np.concatenate([spam, ham])

    # The lines are still bytes; CountVectorizer decodes them itself.
    features = CountVectorizer().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, train_size=0.75, random_state=random_state
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Count the mismatches directly: deriving the count from the float
    # accuracy can land just below the true integer and be truncated.
    misclassified = int(np.count_nonzero(y_test != y_pred))
    return accuracy, len(y_test), misclassified
def main():
    """Run the detector on the first 10% of the data and print the results."""
    score, n_tested, n_wrong = spam_detection(random_state=0, fraction=0.1)
    print("Accuracy score:", score)
    report = "%i messages miclassified out of %i" % (n_wrong, n_tested)
    print(report)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement