"""# Q1. Use object oriented programming to write Python script for Decision Tree Classification algorithm without using any existing standard class. Create your own Class and functions which can perform the following functionality:
• Function to calculate the entropy
• Function to calculate the Information gain
• Function to find maximum Information gain
"""

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
dataset

# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i\times Log_b(P_i)$
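#
# A quick worked example of the formula above: a pure node (a single class,
# P = [1.0]) has entropy 0, while a balanced two-class split P = [0.5, 0.5] has
# entropy -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit.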

def entropy_calculate(prob_list):
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy

cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count/len(dataset) for count in counts]
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)

print('Entire system entropy is %.3f bits' % entropy_entire)

cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count/len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm==case], return_counts=True)
    P = [count/len(dataset[dataset.SepalLengthCm==case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_SepalLengthCm[case] * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))

cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count/len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm==case], return_counts=True)
    P = [count/len(dataset[dataset.SepalWidthCm==case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_SepalWidthCm[case] * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))

cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count/len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm==case], return_counts=True)
    P = [count/len(dataset[dataset.PetalLengthCm==case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_PetalLengthCm[case] * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))

cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count/len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm==case], return_counts=True)
    P = [count/len(dataset[dataset.PetalWidthCm==case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_PetalWidthCm[case] * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
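
# The question also asks for a function to compute the information gain of a
# feature and a function to find the maximum information gain. The repeated
# per-feature code above can be folded into two small helpers; this is a minimal
# sketch that reuses entropy_calculate (the helper names information_gain and
# find_max_information_gain are our own, not part of the original script).
def information_gain(data, feature, target='Species'):
    # Entropy of the whole data set with respect to the target column
    _, counts = np.unique(data[target], return_counts=True)
    parent_entropy = entropy_calculate([c/len(data) for c in counts])
    # Weighted entropy of the children after splitting on `feature`
    children_entropy = 0
    for value, count in zip(*np.unique(data[feature], return_counts=True)):
        subset = data[data[feature] == value]
        _, sub_counts = np.unique(subset[target], return_counts=True)
        children_entropy += (count/len(data)) * entropy_calculate([c/len(subset) for c in sub_counts])
    return parent_entropy - children_entropy

def find_max_information_gain(data, features, target='Species'):
    # Return the feature with the highest information gain, plus all gains
    gains = {feature: information_gain(data, feature, target) for feature in features}
    return max(gains, key=gains.get), gains

best_feature, gains = find_max_information_gain(
    dataset, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
print('\nFeature with maximum information gain: %s' % best_feature)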

# Training
training_data = dataset[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']].copy()
training_data.head()

# Encode every column as integer category codes, keeping a map back to the original values
category_map = {}
for column in ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))

from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
y = training_data[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))

from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
y = y_train[['Species']]

clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X, y)
print(clf)

print(clf.feature_importances_)

from sklearn.metrics import accuracy_score

predictions = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))
print(predictions)
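
# matplotlib is imported above but not otherwise used; as an optional extra (not
# required by the question), the fitted tree can be drawn with
# sklearn.tree.plot_tree (available in scikit-learn 0.21 and later). A minimal sketch:
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 8))
plot_tree(clf,
          feature_names=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
          filled=True)
plt.show()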


"""# Q.2 Write Python script to implement Random Forest Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset.

Data Set Characteristics:
    Number of Instances: 1797
    Number of Attributes: 64
    Attribute Information: 8x8 image of integer pixels in the range 0..16.
    Missing Attribute Values: None

The data set contains images of hand-written digits: 10 classes where each class refers to a digit.
"""

# using standard class of python
from collections import Counter

import numpy as np
from sklearn.tree import DecisionTreeClassifier


def bootstrap_sample(X, y):
    # Draw n_samples indices with replacement (a bootstrap sample of the training set)
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]


def most_common_label(y):
    # Majority vote: return the most frequent label in y
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common


class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        # Fit n_trees decision trees, each on its own bootstrap sample
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(min_samples_split=self.min_samples_split,
                                          max_depth=self.max_depth,
                                          max_features=self.n_feats)
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        # Collect each tree's predictions, then take the majority vote per sample
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)

# Importing relevant libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split

# function for accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
data = datasets.load_digits()
X = data.data
y = data.target

# splitting dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = RandomForest(n_trees=3, max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)

print("Accuracy:", acc)
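
# Optional sanity check (not part of the question): run the same split through
# scikit-learn's own RandomForestClassifier and compare the accuracy.
from sklearn.ensemble import RandomForestClassifier

sk_clf = RandomForestClassifier(n_estimators=3, max_depth=10, random_state=1234)
sk_clf.fit(X_train, y_train)
print("sklearn RandomForestClassifier accuracy:", accuracy(y_test, sk_clf.predict(X_test)))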


"""# Q3. Write Python script to implement K-Nearest Neighbours. Use standard Class of Python to implement the algorithm by choosing your own dataset."""

# python algorithm for KNN using standard class of python
from collections import Counter

import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]

# Importing relevant libraries
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

# function to check accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

# checking accuracy
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN classification accuracy", accuracy(y_test, predictions))
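
# Optional extra (not part of the question): a quick look at how the accuracy of
# this KNN implementation changes with k on the same train/test split.
for k_value in [1, 3, 5, 7, 9]:
    knn_k = KNN(k=k_value)
    knn_k.fit(X_train, y_train)
    print("k = %d, accuracy = %.3f" % (k_value, accuracy(y_test, knn_k.predict(X_test))))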

"""# Q4. Write Python script to implement Naïve Bayes Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset."""

import numpy as np


class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)

        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        # Gaussian (normal) probability density of x under the class's mean and variance
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

# Imports
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

iris = load_iris()
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
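
# Since _pdf above is the Gaussian density, this implementation can be
# cross-checked against scikit-learn's GaussianNB on the same split
# (an optional check, not part of the question).
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print("sklearn GaussianNB accuracy", accuracy(y_test, gnb.predict(X_test)))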