Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Author: Kairat Tilegen
- #Email: kairat.tilegen@is.sdu.edu.kz
import copy  # is used to copy whole array to make new Dataset
import csv  # is used to read csv file
import random  # For colors
import shlex  # is used to split avoiding unecessary symbols'_'
import sys  # used by the csv error handler below (was missing in the original)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split  # NOTE(review): deprecated module; sklearn.model_selection in newer sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
##############################################################################################################################################
# Working with Data: convert the categorical words of bank.csv to numeric
# codes via lookup dictionaries (one per categorical column).
filename = '/users/kairat/Desktop/bank.csv'


def _enum_map(labels):
    # Map each label to its position in the list (0, 1, 2, ...).
    return {label: position for position, label in enumerate(labels)}

# job
f2 = _enum_map(["admin.", "unknown", "unemployed", "management", "housemaid",
                "entrepreneur", "student", "blue-collar", "self-employed",
                "retired", "technician", "services"])
# marital
f3 = _enum_map(["married", "divorced", "single"])
# education
f4 = _enum_map(["unknown", "secondary", "primary", "tertiary"])
# default (yes -> 0, no -> 1)
f5 = _enum_map(["yes", "no"])
# housing
f7 = _enum_map(["yes", "no"])
# loan
f8 = _enum_map(["yes", "no"])
# contact
f9 = _enum_map(["unknown", "cellular", "telephone"])
# month
f11 = _enum_map(["jan", "feb", "mar", "apr", "may", "jun",
                 "jul", "aug", "sep", "oct", "nov", "dec"])
# poutcome
f16 = _enum_map(["unknown", "other", "failure", "success"])
# y (the target label: yes -> 0, no -> 1)
f17 = _enum_map(["yes", "no"])
# Load the raw CSV and split every semicolon-delimited record into fields.
dataset = []
data = []
target = []
with open(filename, 'rb') as f:  # 'rb' is the Python 2 csv convention
    reader = csv.reader(f)  # read via csv built-in function
    try:
        for row in reader:
            # shlex strips the surrounding quote/space noise from the record
            tokens = shlex.split(row[0])
            dataset.append(tokens[0].split(';'))
    except csv.Error as e:
        sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
# The first CSV record holds the column headers (feature names).
features = list(dataset[0])

# TRANSFORMING INFORMATION TO DATA AND TARGET.
# Columns listed here need a word -> code dictionary; every other column is
# already numeric and only needs float(). Column 16 is the target label (y).
_column_maps = {1: f2, 2: f3, 3: f4, 4: f5, 6: f7, 7: f8, 8: f9, 10: f11, 15: f16}

for record in dataset[1:]:
    converted = []
    for col in range(16):
        mapper = _column_maps.get(col)
        # A missing dictionary key still raises KeyError, as in the original.
        value = record[col] if mapper is None else mapper[record[col]]
        converted.append(float(value))
    data.append(converted)
    target.append(float(f17[record[16]]))

# j (number of samples) is still referenced by the blocks below.
j = len(data)
# NEWDATA IS NEW DATASET: every data row with its target label appended.
# (list(row) makes a fresh copy, so `data` itself is left untouched.)
newdata = [list(sample) + [label] for sample, label in zip(data, target)]
#######################################################################
# Working with data: hold out a third of the samples for testing.
x = np.asarray(data)
y = np.asarray(target)
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.33, random_state=0)
#######################################################################
# Getting prepared methods from prediction classes: fit every classifier
# on the training split; the *pred lists collect per-sample predictions.
# Bayes
nb = GaussianNB()
nb.fit(X_train, Y_train)
nbpred = []
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
knnpred = []
# DT
model = DecisionTreeClassifier(min_samples_split=5)
model.fit(X_train, Y_train)
dtpred = []
# LR
logit = LogisticRegression(C=0.5)
logit.fit(X_train, Y_train)
logitpred = []
# LRM (a second LogisticRegression, default parameters)
regr = linear_model.LogisticRegression()
# BUG FIX: the original fit this model on the FULL dataset (x, y), so its
# test-set "accuracy" computed later was inflated by data leakage. Fit it on
# the training split only, like every other model.
regr.fit(X_train, Y_train)
lrm = []
#########################################
# preparing DATA for making Predictions: run every model on each test sample.
datass = list(X_test)  # CONVERT FROM NP TO LIST
for ndata in datass:
    # ndata stays bound to the last sample after the loop (as before).
    knnpred.append(knn.predict([ndata]))
    dtpred.append(model.predict([ndata]))
    nbpred.append(nb.predict([ndata]))
    logitpred.append(logit.predict([ndata]))
    lrm.append(regr.predict([ndata]))
- def manipulation():# GIVING INFORMATION ABOUT HOW DATA WAS MANIPULATED
- for i in range(0,len(datass)):
- print "DATA:",ndata
- print "KNN class predicted :",(knn.predict([ndata]))
- print "KNN probability:",(knn.predict_proba([ndata]))
- print "DT class predicted:",(model.predict([ndata]))
- print "DT proba:",(model.predict_proba([ndata]))
- print "NB class:",(nb.predict([ndata]))
- print "NB probability:",nb.predict_proba([ndata])
- print "Logit class :",(logit.predict([ndata]))
- print "Logit probability:",logit.predict_proba([ndata])
- print "LRM class :",(regr.predict([ndata]))
- print "LRM probability:",regr.predict_proba([ndata])
- def accuracies():
- print "accuracy KNN Algorithm:",accuracy_score(Y_test, knnpred)
- print "accuracy Data Tree:",accuracy_score(Y_test, dtpred)
- print "accuracy Gaussian Normal:",accuracy_score(Y_test,nbpred)
- print "accuracy Logistic Regression:",accuracy_score(Y_test, logitpred)
- print "accuract LRM",accuracy_score(Y_test,lrm)
- def logits():
- print('Coefficients: \n', logit.coef_)
- # Explained variance score: 1 is perfect prediction
- print('Variance: %.2f' % logit.score(X_test, Y_test))
- print pd.DataFrame(zip(features,logit.coef_),columns=['features','EstCoeff'])
- y_pred=logit.predict(X_test)
- print("MSE: %.2f" % (metrics.mean_squared_error(Y_test,y_pred)))
- print("MAE: %.2f" % (metrics.mean_absolute_error(Y_test,y_pred)))
- print("RMSE: %.2f" % (np.sqrt(metrics.mean_squared_error(Y_test,y_pred))))
# TO FIND THE HIGHEST CORRELATION x > 0.6
indexes = []
forplot = []
data_frame = pd.DataFrame(newdata)
data_frame.head()  # NOTE(review): return value unused; kept from original
data_frame.columns = features
result = data_frame.y
columns = [data_frame.age, data_frame.job, data_frame.marital,
           data_frame.education, data_frame.default, data_frame.balance,
           data_frame.housing, data_frame.loan, data_frame.contact,
           data_frame.day, data_frame.month, data_frame.duration,
           data_frame.campaign, data_frame.pdays, data_frame.previous,
           data_frame.poutcome, data_frame.y]
column = []
row = []
# Record every feature pair whose correlation exceeds 0.6. The `-1` bound
# on j excludes the last column (y) as the second member of a pair.
for i in range(0, len(columns)):
    for j in range(i + 1, len(columns) - 1):
        # Hoisted: the original computed corr() twice per pair.
        c = columns[i].corr(columns[j])
        if c > 0.6:
            forplot.append(c)
            column.append(features[i])
            row.append(features[j])
- def corrr():
- for i in range(0,len(columns)):
- for j in range(i+1,len(columns)-1):
- print 'corr btw',features[i],'and',features[j],columns[i].corr(columns[j])
def high_cor():
    # Print the correlation values (> 0.6) collected by the scan above.
    print 'WITH HIGH CORRELATIONS', forplot
# NOTE(review): these five statements duplicate the setup at the top of
# scatter_plot() below, and ax/s/f are never read again at module level —
# they look like dead code left from a refactor. (With the paste's
# indentation lost, they could also have been intended inside high_cor();
# confirm against the original file.) Kept as-is to preserve behavior.
plt.figure(figsize=(45, 15))
plots = len(forplot)
ax = []
s = 0
f = 0
##############################
# Drawing Scatter Plot
def scatter_plot():
    # One scatter subplot per high-correlation feature pair, laid out on a
    # 5x4 grid, each drawn in a random colour.
    plt.figure(figsize=(45, 15))
    plots = len(forplot)
    ax = []
    s = 0  # grid row
    f = 0  # grid column
    for i in range(0, plots):
        ax.append(plt.subplot2grid((5, 4), (s, f)))
        f += 1
        # FIX: pass a single RGB tuple wrapped in a list. The original passed
        # a flat list of three floats, which matplotlib can misinterpret as
        # three colour-map values instead of one RGB colour.
        colour = [(random.random(), random.random(), random.random())]
        ax[i].scatter(data_frame[row[i]], data_frame[column[i]],
                      s=10, c=colour, marker="o")
        ax[i].set_ylabel(row[i])
        ax[i].set_xlabel(column[i])
        if (i + 1) % 4 == 0:  # wrap to the next grid row every 4 plots
            s += 1
            f = 0
    plt.show()
- inp='lylylylalyalyalyal'
- while inp!=' ':
- print "1 - Manipulation"
- print "2 - Accuracies"
- print '3 - Show Scatter Plot'
- print '4 - High Corrs'
- print '5 - Logit Implementations'
- print '6 - CORRS between features'
- inp=raw_input('Enter The command: ')
- if inp=='1':
- manipulation()
- elif inp=='2':
- accuracies()
- elif inp=='3':
- scatter_plot()
- elif inp=='4':
- high_cor()
- elif inp=='5':
- logits()
- elif inp=='6':
- corrr()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement