Advertisement
Guest User

Untitled

a guest
Apr 18th, 2019
129
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.93 KB | None | 0 0
  1. def spam_detection(random_state=0, fraction=1.0):
  2. vectorizer = CountVectorizer()
  3. h = gzip.open(get_path('ham.txt.gz'), 'rb')
  4. s = gzip.open(get_path('spam.txt.gz'), 'rb')
  5. hr = h.read()
  6. sr = s.read()
  7. ham = hr[0:int(fraction*len(hr))]
  8. spam = sr[0:int(fraction*len(sr))]
  9. hammat = vectorizer.fit_transform([ham])
  10. spammat = vectorizer.fit_transform([spam])
  11. #print(hammat.shape, spammat.shape)
  12. X = vectorizer.fit_transform([ham, spam]) #np.vstack([hammat, spammat])
  13.  
  14. y = np.hstack([[0]*hammat.shape[0], [1]*spammat.shape[0]]) #[0]*hammat.shape[1] + [1]*spammat.shape[1]
  15. #print(X.shape, y.shape)
  16.  
  17.  
  18. X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.75)
  19. model = MultinomialNB()
  20. model.fit(X_train, y_train)
  21. y_fitted = model.predict(X_test)
  22. acc=metrics.accuracy_score(y_test, y_fitted)
  23.  
  24. return acc, 0, 0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement