"""# Q1. Use object-oriented programming to write a Python script for the Decision Tree Classification algorithm without using any existing standard class. Create your own class and functions which can perform the following functionality:
• Function to calculate the entropy
• Function to calculate the Information gain
• Function to find the maximum Information gain (see the helper sketch after the per-feature gain calculations below)
"""
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
dataset
# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \times \log_b(P_i)$
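# For reference, the Information gain computed for each feature below is the entire-dataset
# entropy minus the weighted entropy of the subsets created by splitting on attribute A:
#
# Information Gain = $H(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} \times H(S_v)$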
def entropy_calculate(prob_list):
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy
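# Quick sanity check of entropy_calculate (illustrative, not part of the assignment data):
# a fair 50/50 split should give exactly 1 bit of entropy.
print('Entropy of a fair coin: %.3f bits' % entropy_calculate([0.5, 0.5]))  # expected 1.000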
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count/len(dataset) for count in counts]
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)
print('Entire system entropy is %.3f bits' % entropy_entire)
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count/len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))
cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count/len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))
cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count/len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))
cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count/len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
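# The question also asks for a function that finds the maximum Information gain. The repeated
# per-feature calculation above can be wrapped as a helper; this is a minimal sketch, and the
# names information_gain and best_split_feature are illustrative rather than from the original script.
def information_gain(df, feature, target='Species'):
    """Information gain of splitting df on `feature` with respect to `target`."""
    _, counts = np.unique(df[target], return_counts=True)
    entropy_total = entropy_calculate([c/len(df) for c in counts])
    weighted_entropy = 0
    for value, count in zip(*np.unique(df[feature], return_counts=True)):
        subset = df[df[feature] == value]
        _, sub_counts = np.unique(subset[target], return_counts=True)
        weighted_entropy += (count/len(df)) * entropy_calculate([c/len(subset) for c in sub_counts])
    return entropy_total - weighted_entropy

def best_split_feature(df, features, target='Species'):
    """Return the feature with the maximum Information gain, plus all gains."""
    gains = {feature: information_gain(df, feature, target) for feature in features}
    return max(gains, key=gains.get), gains

best_feature, all_gains = best_split_feature(
    dataset, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
print('Feature with maximum Information gain: %s' % best_feature)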
# Training
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']]
training_data.head()

category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = y_train[['Species']]
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X, y)
clf
clf.feature_importances_

from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
print(y_pred)
print('Accuracy:', accuracy_score(y_test, y_pred))
- """# Q.2 Write Python script to implement Random Forest Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset.
- Data Set Characteristics:
- Number of Instances
- 1797
- Number of Attributes
- 64
- Attribute Information
- 8x8 image of integer pixels in the range 0..16.
- Missing Attribute Values
- None
- The data set contains images of hand-written digits: 10 classes where each class refers to a digit.
- """
# using standard classes of Python
from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

def most_common_label(y):
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common
class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(min_samples_split=self.min_samples_split,
                                          max_depth=self.max_depth,
                                          max_features=self.n_feats)
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        # collect each tree's predictions, then majority-vote per sample
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)
# Importing relevant libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split

# function for accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
data = datasets.load_digits()
X = data.data
y = data.target

# splitting dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = RandomForest(n_trees=3, max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)
print("Accuracy:", acc)
- """# Q3. Write Python script to implement K-Nearest Neighbours. Use standard Class of Python to implement the algorithm by choosing your own dataset."""
# Python algorithm for KNN using standard classes of Python
from collections import Counter
import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]
# Importing relevant libraries
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

# function to check accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

# checking accuracy
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN classification accuracy", accuracy(y_test, predictions))
- """# Q4. Write Python script to implement Naïve Bayes Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset.
- """
import numpy as np

class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)
        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []
        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator
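# For reference, _pdf above is the per-feature Gaussian likelihood, and _predict picks the class
# that maximizes the log-posterior under the naive independence assumption:
#
# $P(x_i \mid y=c) = \frac{1}{\sqrt{2\pi\sigma_c^2}} \exp\left(-\frac{(x_i-\mu_c)^2}{2\sigma_c^2}\right)$
#
# $\hat{y} = \arg\max_c \left[ \log P(y=c) + \sum_i \log P(x_i \mid y=c) \right]$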
# Imports
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

iris = load_iris()
X = iris.data[:, :2]
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes classification accuracy", accuracy(y_test, predictions))