# Untitled

# -*- coding: utf-8 -*-
"""
Created on Tue May 28 15:04:39 2019

@author: Ishwor Bhusal
"""

# Importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the tab-separated SMS spam collection file into a DataFrame.
data = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')

# The inspection results below are printed explicitly: a bare expression is
# only echoed by an interactive console and is silently discarded when this
# file is run as a script.
print(data.head())

# Total count of null values in each column
print(data.isnull().sum())

# Number of rows for each distinct value of the 'label' column
print(data['label'].value_counts())

# train_test_split divides the data into a training part and a test part
from sklearn.model_selection import train_test_split

# Inputs are the 'message' column, targets are the 'label' column
X_data, y_data = data['message'], data['label']

# Hold out 25% of the rows for testing; random_state is pinned to a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.25,
    random_state=508,
)

# Bag-of-words features: CountVectorizer turns each message into token counts
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

# Learn the vocabulary from the training messages and build the count matrix.
# fit_transform is equivalent to fit followed by transform, so a single call
# is enough; doing both only repeats the same work.
X_train_counts = count_vect.fit_transform(X_train)

# Checking the shape of the count matrix (printed so that it is visible when
# the file is run as a script)
print(X_train_counts.shape)

# Importing the tf-idf transformer. The scikit-learn class is named
# TfidfTransformer; there is no TfidataTransformer in
# sklearn.feature_extraction.text.
from sklearn.feature_extraction.text import TfidfTransformer
tfi_trans = TfidfTransformer()

# Fitting the transformer on the training counts and looking at the shape of
# the resulting matrix
X_train_tfidata = tfi_trans.fit_transform(X_train_counts)
print(X_train_tfidata.shape)

# Importing the tf-idf vectorizer (TfidfVectorizer), which works directly on
# the raw text instead of on a precomputed count matrix
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fitting the tf-idf vectorizer on the raw training messages; this rebinds
# X_train_tfidata, replacing the matrix produced by tfi_trans above
X_train_tfidata = vectorizer.fit_transform(X_train)

from sklearn.svm import LinearSVC
clf = LinearSVC()

# Fitting the linear support vector classifier on the vectorized training data
clf.fit(X_train_tfidata, y_train)

# Importing Pipeline to chain vectorization and classification in one estimator.
# The step name 'tfidata' is kept unchanged so that any lookup of the step by
# that name keeps working.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidata', TfidfVectorizer()), ('clf', LinearSVC())])

# Fitting the pipeline on the raw training messages and their labels
text_clf.fit(X_train, y_train)

# Predicting labels for the held-out test messages
predictions = text_clf.predict(X_test)

# Importing confusion matrix and classification report from scikit-learn
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the true test labels against the predicted labels
print(confusion_matrix(y_test, predictions))

# Classification report for the same labels and predictions
print(classification_report(y_test, predictions))

# Overall accuracy on the test set. It is printed like the two reports above;
# as a bare expression the value would be computed and then discarded when
# the file is run as a script.
from sklearn import metrics
print(metrics.accuracy_score(y_test, predictions))

# Hand-written example messages used to spot-check the fitted pipeline:
# each one is classified to see whether the model labels it spam or
# legitimate (ham).
sample_messages = [
    "Congratulations! for your achievement what you have got!",
    "Mr. Customer, you are eligible to get 50% discount on your purchase of your clothings in your next visit.",
    "Mom, come home as soon as possible, someone wants to see you soon!",
    "Dad, come home as soon as possible, someone wants to see you soon!",
    "Brother, come home as soon as possible, someone wants to see you soon!",
    "Dear Customer, come home as soon as possible, someone wants to see you soon!",
    "Dear Mr. Bhusal, come home as soon as possible, someone wants to see you soon!",
    "Mrs. Kala, you are one of those people who have been choose for next round of interviews, congratulations!",
]

# Classify all examples in one call and print each predicted label next to
# its message; without the print the predictions would be discarded when the
# file is run as a script.
sample_predictions = text_clf.predict(sample_messages)
for message, label in zip(sample_messages, sample_predictions):
    print(f"{label}\t{message}")