Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 14:17:49 2019

Breast-cancer classification demo on the scikit-learn Wisconsin dataset:
exploratory plots, logistic regression, and a decision-tree depth sweep.

@author: Ricky Hu
"""
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier            # KNN
from sklearn.linear_model import LogisticRegression           # Logistic Regression
from sklearn.tree import DecisionTreeClassifier               # Decision Tree
from sklearn.ensemble import RandomForestClassifier           # Random Forest
from sklearn.neural_network import MLPClassifier              # Neural Network
from sklearn.svm import SVC                                   # SVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt   # pyplot is the supported API; pylab is discouraged
import numpy as np
import graphviz

# '%matplotlib inline' is IPython/Jupyter magic and is a SyntaxError in a
# plain .py file; keep it commented so the script also runs outside notebooks.
# %matplotlib inline
#%%
# Load the breast cancer dataset (a Bunch with data, target and metadata).
cancer = load_breast_cancer()
print(cancer.keys())
print(cancer.DESCR)
#%%
# Print feature names to visualize
print(cancer.feature_names)
#%%
# Print target names to visualize
print(cancer.target_names)
#%%
# Look at the type and dimensions of the dataset.
# FIX: the original used bare expressions (`type(cancer.data)`), which only
# echo a value in a REPL/notebook cell; print them so a plain script shows
# the output too.
print(type(cancer.data))
print(cancer.data.shape)
#%%
# 2-D scatter of dataset columns 1 and 2 (texture vs. perimeter),
# colored by the class label.
x_col, y_col = 1, 2
fig = plt.figure(figsize=(8, 6))
plt.scatter(cancer.data[:, x_col], cancer.data[:, y_col], c=cancer.target)
plt.xlabel(str(cancer.feature_names[x_col]))
plt.ylabel(str(cancer.feature_names[y_col]))
plt.show()
#%%
# ---------------- Logistic Regression ----------------
# Stratified split preserves the malignant/benign ratio in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print('Accuracy on the training set: {:.3f}'.format(log_reg.score(X_train, y_train)))
# BUG FIX: the original printed "training set" on both lines; this second
# score is computed on the held-out test set.
print('Accuracy on the test set: {:.3f}'.format(log_reg.score(X_test, y_test)))
#%%
# ----------------- Decision Tree -----------------
# Sweep max_depth and record train/test accuracy to visualize over-fitting.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42)
training_accuracy = []
test_accuracy = []
max_dep = range(1, 15)
for md in max_dep:
    tree = DecisionTreeClassifier(max_depth=md, random_state=0)
    tree.fit(X_train, y_train)
    training_accuracy.append(tree.score(X_train, y_train))
    test_accuracy.append(tree.score(X_test, y_test))
plt.plot(max_dep, training_accuracy, label='Accuracy of the training set')
# BUG FIX: the original plotted against `neighbors_setting`, a name that is
# never defined here (NameError — likely copied from a KNN example); the
# x-axis values are the swept depths in `max_dep`.
plt.plot(max_dep, test_accuracy, label='Accuracy of the test set')
plt.ylabel('Accuracy')
plt.xlabel('Max Depth')
plt.legend()
# With larger max_depth (>5) the model over-fits the training data: training
# accuracy keeps rising while test accuracy decreases.
# Other pruning parameters worth experimenting with:
#  - min_samples_leaf, min_samples_split
#  - max_leaf_nodes
# From the plot, the best result occurs when max_depth is 3.
#%%
# Export the last-trained decision tree (max_depth=14 from the sweep above)
# to Graphviz .dot format for visualization.
# NOTE(review): hard-coded, user-specific absolute path — kept as-is to
# preserve the original behavior, but adjust it for your machine.
export_graphviz(tree,
                out_file=r"C:\Users\Ricky Hu\Desktop\ml\cancerTree.dot",
                class_names=['malignant', 'benign'],
                feature_names=cancer.feature_names,
                impurity=False,
                filled=True)
#%%
print('Feature importances: {}'.format(tree.feature_importances_))
# FIX: the original bare `type(...)` expression is a no-op in a script;
# print it so the type is actually displayed.
print(type(tree.feature_importances_))
#%%
# Horizontal bar chart of the decision tree's per-feature importances.
n_feature = cancer.data.shape[1]
positions = np.arange(n_feature)
plt.barh(positions, tree.feature_importances_, align='center')
plt.yticks(positions, cancer.feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement