Advertisement
Guest User

Untitled

a guest
Oct 26th, 2016
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.16 KB | None | 0 0
  1. #Author: Kairat Tilegen
  2. #Email: kairat.tilegen@is.sdu.edu.kz
  3.  
  4. import csv # is used to read csv file
  5. import shlex # is used to split avoiding unecessary symbols'_'
  6. import copy # is used to copy whole array to make new Dataset
  7. import numpy as np
  8. from sklearn.neighbors import KNeighborsClassifier
  9. from sklearn.cross_validation import train_test_split
  10. from sklearn.metrics import accuracy_score
  11. from sklearn.tree import DecisionTreeClassifier
  12. from sklearn.linear_model import LogisticRegression, LinearRegression
  13. from sklearn import linear_model
  14. import matplotlib.pyplot as plt
  15. from sklearn import metrics
  16. import pandas as pd
  17. from sklearn.naive_bayes import GaussianNB
  18. import random # For colors
  19.  
  20. ##############################################################################################################################################
  21.  
  22. #Working with Data Converting Words to Numeric using Dictionary
  23. #Reading Banking.csv file and implementing it
  24. filename = '/users/kairat/Desktop/bank.csv'
  25.  
  26.  #job
  27. f2 = {"admin.":0,"unknown":1,"unemployed":2,"management":3,"housemaid":4,"entrepreneur":5,"student":6,"blue-collar":7,"self-employed":8,"retired":9,"technician":10,"services":11}
  28. #marital    
  29. f3 = {"married":0,"divorced":1,"single":2}
  30. #education    
  31. f4 ={ "unknown":0,"secondary":1,"primary":2,"tertiary":3}
  32.     #default
  33. f5 = { 'no':1,'yes':0}
  34.     #housing    
  35. f7 = {'no':1,'yes':0}
  36.     #loan
  37. f8 = {'no':1,'yes':0}
  38.     #contact
  39. f9 = {'unknown':0,'cellular':1,'telephone':2}
  40.     #month
  41. f11 = {"jan":0, "feb":1, "mar":2, "apr":3,"may":4,"jun":5,"jul":6,"aug":7,"sep":8,"oct":9, "nov":10, "dec":11}
  42.     #poutcome
  43. f16 = { "unknown":0,"other":1,"failure":2,"success":3}
  44.     #y    
  45. f17 = {"yes":0,"no":1 }
  46. with open(filename, 'rb') as f:
  47.     reader = csv.reader(f)#Read via csv built-in function
  48.     dataset=[]            
  49.     data=[]
  50.     target=[]
  51.     try:        
  52.         for row in reader:
  53.             line=shlex.split(row[0])#Get rid of spaces other symbols..
  54.             dataset.append(line[0].split(';'))              
  55.     except csv.Error as e:
  56.         sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
  57.     features=[]
  58.     for i in dataset[0]:
  59.         features.append(i)
  60.     j=0
  61.     #TRANSFORMING INFORMATION TO DATA AND TARGET
  62.     for i in range(1,len(dataset)):
  63.         data.append([])
  64.         data[j].append(float(dataset[i][0]))
  65.         data[j].append(float(f2[dataset[i][1]]))
  66.         data[j].append(float(f3[dataset[i][2]]))
  67.         data[j].append(float(f4[dataset[i][3]]))
  68.         data[j].append(float(f5[dataset[i][4]]))
  69.         data[j].append(float(dataset[i][5]))        
  70.         data[j].append(float(f7[dataset[i][6]]))
  71.         data[j].append(float(f8[dataset[i][7]]))
  72.         data[j].append(float(f9[dataset[i][8]]))
  73.         data[j].append(float(dataset[i][9]))
  74.         data[j].append(float(f11[dataset[i][10]]))
  75.         data[j].append(float(dataset[i][11]))
  76.         data[j].append(float(dataset[i][12]))
  77.         data[j].append(float(dataset[i][13]))
  78.         data[j].append(float(dataset[i][14]))
  79.         data[j].append(float(f16[dataset[i][15]]))      
  80.         target.append(float(f17[dataset[i][16]]))
  81.         j+=1
  82.     newdata=copy.deepcopy(data)
  83.  
  84.     for i in range(0,j):
  85.          newdata[i].append(target[i])
  86.     # NEWDATA IS NEW DATASET   
  87. #######################################################################
  88.  
  89. # Working with data
  90. x=np.asarray(data)
  91. y=np.asarray(target)
  92. X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33,random_state=0)
  93.  
  94. #######################################################################
  95.  
  96.  
  97. #Getting prepared methods from prediction classes
  98. #Bayes
  99. nb=GaussianNB()
  100. nb.fit(X_train,Y_train)
  101. nbpred=[]
  102. #KNN
  103. knn=KNeighborsClassifier(n_neighbors=3)
  104. knn.fit(X_train,Y_train)
  105. knnpred=[]
  106. #DT
  107. model = DecisionTreeClassifier(min_samples_split=5)
  108. model.fit(X_train, Y_train)
  109. dtpred=[]
  110. #LR
  111. logit = LogisticRegression(C=0.5)
  112. logit.fit(X_train,Y_train)
  113. logitpred=[]
  114. #LRM
  115. regr = linear_model.LogisticRegression()
  116. regr.fit(x, y)
  117. lrm=[]
  118. #########################################
  119. #preparing DATA for making Predictions
  120. datass=list(X_test)# CONVERT FROM NP TO LIST
  121. for i in range(0,len(datass)):
  122.     ndata=datass[i]
  123.     knnpred.append(knn.predict([datass[i]]))
  124.     dtpred.append(model.predict([ndata]))
  125.     nbpred.append(nb.predict([ndata]))
  126.     logitpred.append(logit.predict([ndata]))
  127.     lrm.append(regr.predict([ndata]))
  128.  
  129. def manipulation():# GIVING INFORMATION ABOUT HOW DATA WAS MANIPULATED
  130.     for i in range(0,len(datass)):
  131.         print "DATA:",ndata
  132.         print "KNN class predicted :",(knn.predict([ndata]))
  133.         print "KNN probability:",(knn.predict_proba([ndata]))
  134.         print "DT class predicted:",(model.predict([ndata]))
  135.         print "DT proba:",(model.predict_proba([ndata]))
  136.         print "NB class:",(nb.predict([ndata]))
  137.         print "NB probability:",nb.predict_proba([ndata])
  138.         print "Logit class :",(logit.predict([ndata]))
  139.         print "Logit probability:",logit.predict_proba([ndata])
  140.         print "LRM class :",(regr.predict([ndata]))
  141.         print "LRM probability:",regr.predict_proba([ndata])
  142. def accuracies():
  143.     print "accuracy KNN Algorithm:",accuracy_score(Y_test, knnpred)
  144.     print "accuracy Data Tree:",accuracy_score(Y_test, dtpred)
  145.     print "accuracy Gaussian Normal:",accuracy_score(Y_test,nbpred)
  146.     print "accuracy Logistic Regression:",accuracy_score(Y_test, logitpred)
  147.     print "accuract LRM",accuracy_score(Y_test,lrm)
  148. def logits():
  149.     print('Coefficients: \n', logit.coef_)
  150.     # Explained variance score: 1 is perfect prediction
  151.     print('Variance: %.2f' % logit.score(X_test, Y_test))
  152.  
  153.     print pd.DataFrame(zip(features,logit.coef_),columns=['features','EstCoeff'])
  154.     y_pred=logit.predict(X_test)
  155.     print("MSE: %.2f" % (metrics.mean_squared_error(Y_test,y_pred)))
  156.     print("MAE: %.2f" % (metrics.mean_absolute_error(Y_test,y_pred)))
  157.     print("RMSE: %.2f" % (np.sqrt(metrics.mean_squared_error(Y_test,y_pred))))
  158.  
  159.  
  160. # TO FIND THE HIGHEST CORRELATION x>0.6
  161. indexes=[]
  162. forplot=[]
  163. data_frame =pd.DataFrame(newdata)
  164. data_frame.head()
  165. data_frame.columns=features
  166. result = data_frame.y
  167. columns=[(data_frame.age),(data_frame.job),(data_frame.marital),(data_frame.education),(data_frame.default),(data_frame.balance),(data_frame.housing),(data_frame.loan),(data_frame.contact),(data_frame.day),(data_frame.month),(data_frame.duration),data_frame.campaign,(data_frame.pdays),(data_frame.previous),(data_frame.poutcome),data_frame.y]
  168. column=[]
  169. row=[]
  170. for i in range(0,len(columns)):
  171.     for j in range(i+1,len(columns)-1):
  172.         if columns[i].corr(columns[j])>0.6:
  173.             forplot.append(columns[i].corr(columns[j]))
  174.             column.append(features[i])
  175.             row.append(features[j])
  176. def corrr():
  177.     for i in range(0,len(columns)):
  178.         for j in range(i+1,len(columns)-1):
  179.             print 'corr btw',features[i],'and',features[j],columns[i].corr(columns[j])
  180.  
  181. def high_cor():
  182.     print 'WITH HIGH CORRELATIONS', forplot
  183. plt.figure(figsize=(45, 15))
  184. plots = len(forplot)
  185. ax=[]
  186. s=0
  187. f=0
  188.  
  189. ##############################
  190. # Drawing Scatter Plot
  191. def scatter_plot():
  192.     plt.figure(figsize=(45, 15))
  193.     plots = len(forplot)
  194.     ax=[]
  195.     s=0
  196.     f=0
  197.     for i in range(0,plots):
  198.             ax.append(plt.subplot2grid((5,4), (s,f)))
  199.             f+=1
  200.             ax[i].scatter(data_frame[row[i]],data_frame[column[i]],  s=10, c=[random.random(),random.random(),random.random()], marker="o")
  201.             #ax[i].plot(data_frame[row[i]],data_frame[column[i]])
  202.             ax[i].set_ylabel(row[i])
  203.             ax[i].set_xlabel(column[i])
  204.             if (i+1)%4==0:
  205.                 s+=1
  206.                 f=0
  207.     plt.show()
  208. inp='lylylylalyalyalyal'
  209. while inp!=' ':
  210.     print "1 - Manipulation"
  211.     print "2 - Accuracies"
  212.     print '3 - Show Scatter Plot'
  213.     print '4 - High Corrs'
  214.     print '5 - Logit Implementations'
  215.     print '6 - CORRS between features'
  216.     inp=raw_input('Enter The command: ')        
  217.     if inp=='1':
  218.         manipulation()
  219.     elif inp=='2':
  220.         accuracies()
  221.     elif inp=='3':
  222.         scatter_plot()
  223.     elif inp=='4':
  224.         high_cor()
  225.     elif inp=='5':
  226.         logits()
  227.     elif inp=='6':
  228.         corrr()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement