Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 14:17:49 2019

Breast-cancer classification demo on the scikit-learn Wisconsin dataset:
exploratory plots, logistic regression, and a decision-tree depth sweep.

@author: Ricky Hu
"""
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier            # KNN
from sklearn.linear_model import LogisticRegression           # Logistic Regression
from sklearn.tree import DecisionTreeClassifier               # Decision Tree
from sklearn.ensemble import RandomForestClassifier           # Random Forest
from sklearn.neural_network import MLPClassifier              # Neural Network
from sklearn.svm import SVC                                   # SVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt   # pyplot is the supported API; pylab is discouraged
import numpy as np
import graphviz

# '%matplotlib inline' is IPython/Jupyter magic and is a SyntaxError in a
# plain .py file; keep it commented so the script also runs outside notebooks.
# %matplotlib inline
#%%
# Load the breast cancer dataset (a Bunch with data, target and metadata).
cancer = load_breast_cancer()
print(cancer.keys())
print(cancer.DESCR)
#%%
# Print feature names to visualize
print(cancer.feature_names)
#%%
# Print target names to visualize
print(cancer.target_names)
#%%
# Look at the type and dimensions of the dataset.
# FIX: the original used bare expressions (`type(cancer.data)`), which only
# echo a value in a REPL/notebook cell; print them so a plain script shows
# the output too.
print(type(cancer.data))
print(cancer.data.shape)
#%%
# 2-D scatter of dataset columns 1 and 2 (texture vs. perimeter),
# colored by the class label.
x_col, y_col = 1, 2
fig = plt.figure(figsize=(8, 6))
plt.scatter(cancer.data[:, x_col], cancer.data[:, y_col], c=cancer.target)
plt.xlabel(str(cancer.feature_names[x_col]))
plt.ylabel(str(cancer.feature_names[y_col]))
plt.show()
#%%
# ---------------- Logistic Regression ----------------
# Stratified split preserves the malignant/benign ratio in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print('Accuracy on the training set: {:.3f}'.format(log_reg.score(X_train, y_train)))
# BUG FIX: the original printed "training set" on both lines; this second
# score is computed on the held-out test set.
print('Accuracy on the test set: {:.3f}'.format(log_reg.score(X_test, y_test)))
#%%
# ----------------- Decision Tree -----------------
# Sweep max_depth and record train/test accuracy to visualize over-fitting.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42)
training_accuracy = []
test_accuracy = []
max_dep = range(1, 15)
for md in max_dep:
    tree = DecisionTreeClassifier(max_depth=md, random_state=0)
    tree.fit(X_train, y_train)
    training_accuracy.append(tree.score(X_train, y_train))
    test_accuracy.append(tree.score(X_test, y_test))
plt.plot(max_dep, training_accuracy, label='Accuracy of the training set')
# BUG FIX: the original plotted against `neighbors_setting`, a name that is
# never defined here (NameError — likely copied from a KNN example); the
# x-axis values are the swept depths in `max_dep`.
plt.plot(max_dep, test_accuracy, label='Accuracy of the test set')
plt.ylabel('Accuracy')
plt.xlabel('Max Depth')
plt.legend()
# With larger max_depth (>5) the model over-fits the training data: training
# accuracy keeps rising while test accuracy decreases.
# Other pruning parameters worth experimenting with:
#  - min_samples_leaf, min_samples_split
#  - max_leaf_nodes
# From the plot, the best result occurs when max_depth is 3.
#%%
# Export the last-trained decision tree (max_depth=14 from the sweep above)
# to Graphviz .dot format for visualization.
# NOTE(review): hard-coded, user-specific absolute path — kept as-is to
# preserve the original behavior, but adjust it for your machine.
export_graphviz(tree,
                out_file=r"C:\Users\Ricky Hu\Desktop\ml\cancerTree.dot",
                class_names=['malignant', 'benign'],
                feature_names=cancer.feature_names,
                impurity=False,
                filled=True)
#%%
print('Feature importances: {}'.format(tree.feature_importances_))
# FIX: the original bare `type(...)` expression is a no-op in a script;
# print it so the type is actually displayed.
print(type(tree.feature_importances_))
#%%
# Horizontal bar chart of the decision tree's per-feature importances.
n_feature = cancer.data.shape[1]
positions = np.arange(n_feature)
plt.barh(positions, tree.feature_importances_, align='center')
plt.yticks(positions, cancer.feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement