"""# Q1 DECISION TREE -- SHRUTIKA PANDEY 102095004"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# %matplotlib inline
- """# Decision tree Dataset"""
- data_dict = {
- 'Outlook' : ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny','Overcast', 'Overcast', 'Rainy', 'Rainy', 'Rainy']
- ,'Temperature': ['Hot', 'Hot', 'Hot', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild','Hot','Mild', 'Cool', 'Mild']
- ,'Humidity' : ['High', 'High', 'High', 'Normal', 'Normal', 'High', 'Normal', 'Normal','Normal','High', 'Normal', 'High', 'Normal', 'High']
- ,'Wind': ['False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'False']
- ,'PlayGolf': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes']
- }
- golf_data = pd.DataFrame(data_dict, columns=data_dict.keys())
- golf_data
- """# Entropy"""
- # Commented out IPython magic to ensure Python compatibility.
- # %%latex
- #
- # Entropy = $-\sum_{i=1}^{n} P_i\times Log_b(P_i)$
- def entropy_calculate(prob_list):
- entropy = 0
- for item in prob_list:
- entropy -= item * np.log2(item)
- return entropy
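
# Quick sanity checks for the helper: a fair coin carries exactly 1 bit of
# entropy, and a certain outcome carries 0 bits.
print(entropy_calculate([0.5, 0.5]))  # 1.0
print(entropy_calculate([1.0]))       # 0.0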
cases, counts = np.unique(golf_data.PlayGolf, return_counts=True)
P = [count/len(golf_data) for count in counts]
print('Probabilities of %s and %s are %.3f, %.3f respectively' % (cases[0], cases[1], P[0], P[1]))

entropy_entire = entropy_calculate(P)
print("Entire system's entropy is %.3f bits" % entropy_entire)
- """# Information Gain
- # Outlook decision
- """
- cases_outlook,counts_outlook= np.unique(golf_data.Outlook,return_counts=True)
- P_outlook = [count/len(golf_data) for count in counts_outlook]
- print('For outlook:')
- for case, prob in zip(cases_outlook,P_outlook):
- print('\tProbabality of %s is %.3f'%(case, prob))
- entropy_outlook={}
- total_entropy_outlook=0
- for case, prob in zip(cases_outlook,P_outlook):
- cases,counts = np.unique(golf_data.PlayGolf[golf_data.Outlook==case],return_counts=True)
- P = [count/len(golf_data[golf_data.Outlook==case]) for count in counts]
- entropy_outlook[case]=entropy_calculate(P)
- total_entropy_outlook += entropy_calculate(P)*prob
- for case, entropy in entropy_outlook.items():
- print('Entropy for %s is %.2f'%(case,entropy))
- print('\nEntropy at Outlook decision level is %.3f'%total_entropy_outlook)
- print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_outlook))
- """# Temperature Decision"""
- cases_temperature,counts_temperature= np.unique(golf_data.Temperature,return_counts=True)
- P_temperature = [count/len(golf_data) for count in counts_temperature]
- print('For temperature:')
- for case, prob in zip(cases_temperature,P_temperature):
- print('\tProbabality of %s is %.3f'%(case, prob))
- entropy_temperature={}
- total_entropy_temperature=0
- for case, prob in zip(cases_temperature,P_temperature):
- cases,counts = np.unique(golf_data.PlayGolf[golf_data.Temperature==case],return_counts=True)
- P = [count/len(golf_data[golf_data.Temperature==case]) for count in counts]
- entropy_temperature[case]=entropy_calculate(P)
- total_entropy_temperature += entropy_calculate(P)*prob
- for case, entropy in entropy_temperature.items():
- print('Entropy for %s is %.2f'%(case,entropy))
- print('\nEntropy at Temperature decision level is %.3f'%total_entropy_temperature)
- print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_temperature))
- """# Wind Decision"""
- cases_wind,counts_wind= np.unique(golf_data.Wind,return_counts=True)
- P_wind = [count/len(golf_data) for count in counts_wind]
- print('For wind:')
- for case, prob in zip(cases_wind,P_wind):
- print('\tProbabality of %s is %.3f'%(case, prob))
- entropy_wind={}
- total_entropy_wind=0
- for case, prob in zip(cases_wind,P_wind):
- cases,counts = np.unique(golf_data.PlayGolf[golf_data.Wind==case],return_counts=True)
- P = [count/len(golf_data[golf_data.Wind==case]) for count in counts]
- entropy_wind[case]=entropy_calculate(P)
- total_entropy_wind += entropy_calculate(P)*prob
- for case, entropy in entropy_wind.items():
- print('Entropy for %s is %.2f'%(case,entropy))
- print('\nEntropy at Wind decision level is %.3f'%total_entropy_wind)
- print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_wind))
- """# Humidity Decision"""
- cases_humidity,counts_humidity= np.unique(golf_data.Humidity,return_counts=True)
- P_humidity = [count/len(golf_data) for count in counts_humidity]
- print('For humidity:')
- for case, prob in zip(cases_humidity,P_humidity):
- print('\tProbabality of %s is %.3f'%(case, prob))
- entropy_humidity={}
- total_entropy_humidity=0
- for case, prob in zip(cases_humidity,P_humidity):
- cases,counts = np.unique(golf_data.PlayGolf[golf_data.Humidity==case],return_counts=True)
- P = [count/len(golf_data[golf_data.Humidity==case]) for count in counts]
- entropy_humidity[case]=entropy_calculate(P)
- total_entropy_humidity += entropy_calculate(P)*prob
- for case, entropy in entropy_humidity.items():
- print('Entropy for %s is %.2f'%(case,entropy))
- print('\nEntropy at Humidity decision level is %.3f'%total_entropy_humidity)
- print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_humidity))
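
# The four cells above repeat the same computation. A small helper (a sketch
# using only the names defined above) does it for any column. For this classic
# play-golf table the gains come out to roughly 0.247 (Outlook), 0.029
# (Temperature), 0.152 (Humidity) and 0.048 (Wind), which is why ID3 would
# split on Outlook at the root.
def information_gain(df, feature, target='PlayGolf'):
    total = len(df)
    conditional_entropy = 0
    for value, count in zip(*np.unique(df[feature], return_counts=True)):
        subset = df[df[feature] == value]
        probs = [c / len(subset) for c in np.unique(subset[target], return_counts=True)[1]]
        conditional_entropy += (count / total) * entropy_calculate(probs)
    return entropy_entire - conditional_entropy

for feature in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    print('Information gain for %s is %.3f' % (feature, information_gain(golf_data, feature)))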
# Training: encode every column as integer category codes.
training_data = golf_data[['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']].copy()
training_data.head()

category_map = {}
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
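
# A quick decode sketch: category_map stores code -> label, so mapping a coded
# column through it recovers the original strings (pandas assigns codes in
# alphabetical order, e.g. Outlook: {0: 'Overcast', 1: 'Rainy', 2: 'Sunny'}).
training_data['Outlook'].map(category_map['Outlook']).head()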
- """# Split to train and test data"""
- from sklearn.model_selection import train_test_split
- X=training_data[['Outlook', 'Temperature', 'Humidity', 'Wind']]
- y=training_data[['PlayGolf']]
- from sklearn.model_selection import train_test_split
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
- print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train),len(X_test)))
- """# Build a decision tree"""
- from sklearn.tree import DecisionTreeClassifier
- X = X_train[['Outlook', 'Temperature', 'Humidity', 'Wind']]
- y = y_train[['PlayGolf']]
- clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
- clf = clf.fit(X, y)
- clf
- clf.feature_importances_
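
# feature_importances_ is an unlabeled array; pairing it with the column names
# makes it readable.
for name, importance in zip(X.columns, clf.feature_importances_):
    print('%s: %.3f' % (name, importance))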
- """# Tree visualization"""
- from sklearn import tree
- #X[0] -> Outlook,X[1] -> Temperature,X[2] -> Humidity,X[3] -> Wind
- fig, ax = plt.subplots(figsize=(10, 10))
- tree.plot_tree(clf, fontsize=10)
- plt.show()
- """# Evaluate the tree"""
- from sklearn.metrics import accuracy_score
- predictions = clf.predict(X_test)
- accuracy_score(y_test,predictions)
- y_pred = clf.predict(X_test)
- print(y_pred)
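
# With only a handful of test rows, a confusion matrix shows *where* the tree
# goes wrong rather than a single accuracy number (rows are true classes,
# 0=No / 1=Yes; columns are predictions).
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))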
# Predicting for today and tomorrow
today_tomorrow = {
    'Outlook': ['Sunny', 'Sunny'],
    'Temperature': ['Cool', 'Mild'],
    'Humidity': ['Normal', 'Normal'],
    'Wind': ['False', 'False']
}
todtom_data = pd.DataFrame(today_tomorrow)
todtom_data

# Encode the new rows with the *training* codes stored in category_map.
# Deriving fresh categories from just these two rows would assign different
# codes (e.g. 'Sunny' would become 0 instead of 2) and silently feed the
# model wrong inputs.
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    label_to_code = {label: code for code, label in category_map[column].items()}
    todtom_data[column] = todtom_data[column].map(label_to_code)
todtom_data.head()

todtom_data.dropna(inplace=True)
todtom_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(todtom_data)))

p = clf.predict(todtom_data)
pred = [category_map['PlayGolf'][code] for code in p]
print(f'Prediction for today is {pred[0]}')
print(f'Prediction for tomorrow is {pred[1]}')
- """# A) GINI IMPURITY -- SHRUTIKA PANDEY 102095004"""
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- data_dict = {
- 'Outlook' : ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny','Overcast', 'Overcast', 'Rainy', 'Rainy', 'Rainy']
- ,'Temperature': ['Hot', 'Hot', 'Hot', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild','Hot','Mild', 'Cool', 'Mild']
- ,'Humidity' : ['High', 'High', 'High', 'Normal', 'Normal', 'High', 'Normal', 'Normal','Normal','High', 'Normal', 'High', 'Normal', 'High']
- ,'Wind': ['False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'False']
- ,'PlayGolf': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes']
- }
- golf_data = pd.DataFrame(data_dict, columns=data_dict.keys())
- golf_data
# Training: encode every column as integer category codes.
training_data = golf_data[['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']].copy()
training_data.head()

category_map = {}
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
from sklearn.model_selection import train_test_split

X = training_data[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = training_data[['PlayGolf']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

X = X_train[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = y_train[['PlayGolf']]
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='gini')
clf = clf.fit(X, y)
clf.feature_importances_
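
# The 'gini' criterion used above minimises Gini impurity: for class
# probabilities p_i, Gini = 1 - sum(p_i**2). A tiny sketch mirroring
# entropy_calculate, applied to the 9-Yes / 5-No PlayGolf split:
def gini_calculate(prob_list):
    return 1 - sum(p**2 for p in prob_list)

print('Gini impurity of the full dataset: %.3f' % gini_calculate([9/14, 5/14]))  # ~0.459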
from sklearn import tree

fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind'],
               class_names=['No', 'Yes'], fontsize=10)
plt.show()
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print(y_pred)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
# Predicting for today and tomorrow
today_tomorrow = {
    'Outlook': ['Sunny', 'Sunny'],
    'Temperature': ['Cool', 'Mild'],
    'Humidity': ['Normal', 'Normal'],
    'Wind': ['False', 'False']
}
todtom_data = pd.DataFrame(today_tomorrow)
todtom_data

# As in Q1, encode the new rows with the training codes from category_map so
# 'Sunny', 'Mild' and 'Normal' get the same integers the model saw during fit.
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    label_to_code = {label: code for code, label in category_map[column].items()}
    todtom_data[column] = todtom_data[column].map(label_to_code)
todtom_data.head()

todtom_data.dropna(inplace=True)
todtom_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(todtom_data)))

p = clf.predict(todtom_data)
pred = [category_map['PlayGolf'][code] for code in p]
print(f'Prediction for today is {pred[0]}')
print(f'Prediction for tomorrow is {pred[1]}')
- """# Q2 MULTIPLE LINEAR REGRESSSION - 102095004
- # B) MULTIPLE REGRESSION
- """
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- data = pd.read_csv('BostonHousing.csv')
- data.head()
X = data.iloc[:, :-1].values  # every column except the last -> predictors
y = data.iloc[:, -1].values   # last column -> target (median house value)
print(X)
print(y)

sns.heatmap(data.corr())
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(y_pred)

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
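
# R^2 alone does not account for how many predictors the model uses; adjusted
# R^2 penalises model size via 1 - (1 - R^2)(n - 1)/(n - p - 1). A sketch
# computed on the test split:
r2 = r2_score(y_test, y_pred)
n, p_features = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p_features - 1)
print('Adjusted R^2: %.3f' % adjusted_r2)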
plt.scatter(y_test, y_pred)
plt.xlabel('actual')
plt.ylabel('predicted')
plt.title('actual vs predicted')
plt.show()

pred_y_dataset = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Difference': y_test - y_pred})
pred_y_dataset[0:10]
y_pred = lr.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns=['Predicted value'])
y_pred

y_test
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
- """# C) DECISION TREE FOR IRIS DATASET -- SHRUTIKA PANDEY 102095004"""
- import pandas as pd
- import numpy as np
- import warnings
- import matplotlib.pyplot as plt
- warnings.filterwarnings('ignore')
- dataset = pd.read_csv('Iris.csv')
- dataset
- # Commented out IPython magic to ensure Python compatibility.
- # %%latex
- #
- # Entropy = $-\sum_{i=1}^{n} P_i\times Log_b(P_i)$
- def entropy_calculate(prob_list):
- entropy = 0
- for item in prob_list:
- entropy -= item * np.log2(item)
- return entropy
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count/len(dataset) for count in counts]
# Iris has three species, so report all three probabilities.
print('Probabilities of %s, %s and %s are %.3f, %.3f, %.3f respectively'
      % (cases[0], cases[1], cases[2], P[0], P[1], P[2]))

entropy_entire = entropy_calculate(P)
print("Entire system's entropy is %.3f bits" % entropy_entire)
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count/len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))
cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count/len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))
cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count/len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))
cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count/len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
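
# Caveat: the four iris features are continuous, so treating every distinct
# measurement as its own category drives the conditional entropy toward zero
# and inflates the information gain. Binning first gives a more honest
# estimate; a sketch with pd.cut and an arbitrary choice of 3 equal-width bins:
binned = pd.cut(dataset['PetalLengthCm'], bins=3, labels=['short', 'medium', 'long'])
for value in binned.unique():
    subset = dataset.Species[binned == value]
    counts = np.unique(subset, return_counts=True)[1]
    print('%s bin entropy: %.3f' % (value, entropy_calculate([c / len(subset) for c in counts])))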
# Training: encode every column (including the continuous ones) as category codes.
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']].copy()
training_data.head()

category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = y_train[['Species']]
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X, y)
clf

clf.feature_importances_
from sklearn import tree

fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, feature_names=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
               fontsize=10)
plt.show()
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print(y_pred)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
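
# Accuracy alone hides per-class behaviour; a classification report breaks
# precision and recall down per species (the integer codes map back to names
# through category_map['Species']).
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))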