Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def spam_detection(random_state=0, fraction=1.0):
    """Train a multinomial naive Bayes ham/spam classifier and score it.

    The two gzip-compressed corpora located via ``get_path('ham.txt.gz')``
    and ``get_path('spam.txt.gz')`` are read line by line; every line is
    treated as one document.  Documents are turned into bag-of-words count
    vectors with a single shared vocabulary, split 75% / 25% into training
    and test sets, and a ``MultinomialNB`` model is fitted on the training
    part.

    Args:
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        fraction: Leading share of the lines of *each* file to use
            (``1.0`` uses every line).

    Returns:
        A 3-tuple ``(accuracy, 0, 0)`` where ``accuracy`` is the accuracy
        score of the model on the held-out test set.
    """
    # Context managers guarantee both archives are closed even if reading
    # fails; readlines() yields one document per line instead of one giant
    # blob per file.
    with gzip.open(get_path('ham.txt.gz'), 'rb') as ham_file:
        ham_lines = ham_file.readlines()
    with gzip.open(get_path('spam.txt.gz'), 'rb') as spam_file:
        spam_lines = spam_file.readlines()

    # Keep only the requested leading fraction of each corpus (whole lines,
    # so no document is cut in the middle).
    ham = ham_lines[:int(fraction * len(ham_lines))]
    spam = spam_lines[:int(fraction * len(spam_lines))]

    # Fit the vectorizer once on the combined corpus so ham and spam share
    # the same vocabulary / feature columns.  The lines are bytes; the
    # vectorizer decodes them itself.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(ham + spam)

    # Labels follow the row order of X: 0 = ham, 1 = spam.
    y = np.hstack([[0] * len(ham), [1] * len(spam)])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, train_size=0.75)

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_fitted = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_fitted)

    # NOTE(review): the two trailing zeros are placeholders carried over
    # unchanged; presumably they were meant to report the test-set size and
    # the number of misclassified samples -- TODO confirm with callers.
    return acc, 0, 0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement