SHARE
TWEET

Untitled

a guest Apr 24th, 2019 81 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. import gzip
  4. import numpy as np
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.metrics import accuracy_score
  8.  
  9. from sklearn.naive_bayes import MultinomialNB
  10.  
  11.  
  12. def get_path(filename):
  13.     import sys
  14.     import os
  15.     return os.path.join(os.path.dirname(sys.argv[0]), "..", "src", filename)
  16.  
  17. def spam_detection(random_state=0, fraction=1.0):
  18.     spam = ''
  19.     ham = ''
  20.     with gzip.open(get_path('spam.txt.gz'), 'rb') as f:
  21.         spam = f.readlines()
  22.  
  23.     with gzip.open(get_path('ham.txt.gz'), 'r') as f:
  24.         ham = f.readlines()
  25.  
  26.     spam = np.array(spam[0:int((len(spam)*fraction))])
  27.     ham = np.asarray(ham[0:int((len(ham)*fraction))])
  28.  
  29.     print(spam.shape)
  30.     print(ham.shape)
  31.  
  32.     spam_features = np.repeat(0, spam.shape[0])
  33.     ham_features = np.repeat(1, ham.shape[0])
  34.  
  35.     y = np.concatenate([spam_features, ham_features])
  36.     X = np.concatenate([spam,ham])
  37.  
  38.     print(y)
  39.     print(X)
  40.  
  41.     vec = CountVectorizer()
  42.     features = vec.fit_transform(X)
  43.  
  44.     X_train, X_test, y_train, y_test = train_test_split(features, y, train_size = 0.75, random_state=random_state)
  45.  
  46.  
  47.  
  48.     model = MultinomialNB()
  49.     model.fit(X_train, y_train)
  50.  
  51.     y_pred = model.predict(X_test)
  52.  
  53.     ac = accuracy_score(y_test, y_pred)
  54.     fc = len(y_test) * (1-ac)
  55.  
  56.     print(ac)
  57.  
  58.  
  59.     return ac, len(y_test) , fc
  60.  
  61. def main():
  62.     accuracy, total, misclassified = spam_detection(0,0.1)
  63.     print("Accuracy score:", accuracy)
  64.     print("%i messages miclassified out of %i" % (misclassified, total))
  65.  
  66. if __name__ == "__main__":
  67.     main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top