Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""
Created on Tue May 28 15:04:39 2019
@author: Ishwor Bhusal

SMS spam classification walkthrough.

Loads the tab-separated file 'TextFiles/smsspamcollection.tsv', splits its
'message' and 'label' columns into train and test sets, builds TF-IDF
features step by step, then trains a TF-IDF + LinearSVC pipeline, reports its
metrics on the test set and classifies a few hand-written messages.

Intermediate results are printed so they are visible when the file is run as
a script and not only when it is executed line by line in a console.
"""
# Importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the scikit-learn pieces used below
from sklearn import metrics
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Importing dataset to the environment
data = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')
print(data.head())

# Checking the total count of the null values per column
print(data.isnull().sum())

# Counting the values of the label
print(data['label'].value_counts())

# Splitting the data into features (raw text) and target (label)
X_data = data['message']
y_data = data['label']

# Holding out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.25, random_state=508
)

# Running count vectorizer: learn the vocabulary from the training messages
# and turn them into a document-term count matrix in a single step
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Checking the shape of the data
print(X_train_counts.shape)

# Re-weighting the raw counts with TF-IDF
tfi_trans = TfidfTransformer()
X_train_tfidata = tfi_trans.fit_transform(X_train_counts)
print(X_train_tfidata.shape)

# TfidfVectorizer goes from raw text to TF-IDF features directly, replacing
# the two separate steps above
vectorizer = TfidfVectorizer()
X_train_tfidata = vectorizer.fit_transform(X_train)

# Fitting linear support vector classifier in vectorized training data
clf = LinearSVC()
clf.fit(X_train_tfidata, y_train)

# Bundling vectorizer and classifier into a pipeline so raw text can be passed
# straight to fit() / predict()
text_clf = Pipeline([('tfidata', TfidfVectorizer()), ('clf', LinearSVC())])

# Fitting pipeline in training data
text_clf.fit(X_train, y_train)

# Running prediction model into test data
predictions = text_clf.predict(X_test)

# Printing confusion matrix to see the performance of the model
print(confusion_matrix(y_test, predictions))

# Looking at classification report
print(classification_report(y_test, predictions))

# Overall share of test messages that were labelled correctly
print(metrics.accuracy_score(y_test, predictions))

# Hand-written messages used to check whether the model labels them as spam
# or legit (ham)
sample_messages = [
    "Congratulations! for your achievement what you have got!",
    "Mr. Customer, you are eligible to get 50% discount on your purchase of your clothings in your next visit.",
    "Mom, come home as soon as possible, someone wants to see you soon!",
    "Dad, come home as soon as possible, someone wants to see you soon!",
    "Brother, come home as soon as possible, someone wants to see you soon!",
    "Dear Customer, come home as soon as possible, someone wants to see you soon!",
    "Dear Mr. Bhusal, come home as soon as possible, someone wants to see you soon!",
    "Mrs. Kala, you are one of those people who have been choose for next round of interviews, congratulations!",
]
for message in sample_messages:
    print(text_clf.predict([message]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement