Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8

# In[ ]:

# NOTE(review): disabled "Task 1" preprocessing, kept as an inert bare
# triple-quoted string (a string-literal statement has no runtime effect).
# It derived Hotels_data_Changed.csv — adding the DiscountDiff, DayDiff,
# WeekDay and DiscountPerc columns — from hotels_data.csv.
'''
import pandas as pd
from datetime import datetime
from pandas import *
# Task 1
df = pd.DataFrame.from_csv("hotels_data.csv")
# do same but attach it to the dataframe
df['DiscountDiff'] = df.apply(lambda row: row["Original Price"] - row["Discount Price"], axis=1)
df['DayDiff'] = df.apply(lambda row: Timedelta(datetime.strptime(row["Checkin Date"], '%m/%d/%Y %H:%M') - datetime.strptime(row["Snapshot Date"], '%m/%d/%Y %H:%M')).days, axis=1)
df['WeekDay'] = df.apply(lambda row: to_datetime(row["Checkin Date"]).weekday_name, axis=1)
df['DiscountPerc'] = df.apply(lambda row: ((row["DiscountDiff"] * 100) / row["Original Price"]), axis=1)
df.to_csv("Hotels_data_Changed.csv")
'''
# In[134]:

# --- standard library ---
import io
import os
from datetime import datetime  # DTClassifier.Transform calls datetime.strptime

# --- third-party ---
import matplotlib.pyplot as plt
import numpy
import numpy as np  # Results() in both classifiers uses the `np` alias
import pandas as pd
from pandas import *
from scipy import sparse as sp
from sklearn import datasets, linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier, export_graphviz
class NBClassifier():
    """Multinomial Naive Bayes pipeline for predicting Discount_Code.

    Expected call order: Query() -> Transform() -> Classify() -> Results().
    Classify() stores the confusion matrix on the instance for Results().
    """

    def Query(self, df):
        """Reduce df to one row per stay, keeping the best discount offer.

        A "stay" is a (WeekDay, Snapshot_Date, Checkin_Date, DayDiff,
        Hotel_Name) combination. Only stays for which all four discount
        codes appear are kept; for each, the row with the highest
        DiscountPerc wins. The helper columns (counts, DiscountPerc)
        are dropped from the returned frame.
        """
        keys = ["WeekDay", "Snapshot_Date", "Checkin_Date", "DayDiff", "Hotel_Name"]
        work_df = pd.DataFrame(df, columns=keys + ["Discount_Code", "DiscountPerc"])
        # Count how many rows (discount codes) each stay has.
        grouped = work_df.sort_values("DiscountPerc", ascending=False).groupby(by=keys, as_index=True)
        counts = pd.DataFrame(grouped.size().reset_index(name='counts'))
        # Keep only stays for which all 4 discount codes were offered.
        complete = counts[counts['counts'] == 4]
        merged = pd.merge(work_df, complete, on=keys, how='inner')
        ordered = pd.DataFrame(merged.sort_values(["Snapshot_Date", "Checkin_Date", "Discount_Code"],
                                                  ascending=[True, True, True]).reset_index(drop=True))
        # Per-stay maximum DiscountPerc, then inner-merge to keep winners only.
        best = ordered.sort_values("DiscountPerc", ascending=False).groupby(by=keys, as_index=False)
        maxima = pd.DataFrame(best["DiscountPerc"].max().reset_index(drop=True))
        winners = pd.merge(ordered, maxima, on=keys + ["DiscountPerc"], how='inner')
        winners = winners.drop("counts", axis=1).drop("DiscountPerc", axis=1)
        result = pd.DataFrame(winners.sort_values(["Snapshot_Date", "Checkin_Date", "Discount_Code"],
                                                  ascending=[True, True, True]).reset_index(drop=True))
        result.sort_index(inplace=True)
        return result

    def Transform(self, df):
        """One-hot encode dates and hotel names; returns a deduplicated frame.

        Snapshot_Date and Checkin_Date are exploded into day-of-week,
        day-of-month, month and year dummy columns; Hotel_Name is one-hot
        encoded; WeekDay is dropped (redundant with the Checkin day-of-week
        dummies). Finally, rows identical in every feature column are
        collapsed to the first occurrence.
        """
        snap = pd.to_datetime(df['Snapshot_Date']).dt
        final_df = df.drop('Snapshot_Date', axis=1)
        final_df = final_df.join(pd.get_dummies(snap.dayofweek).add_prefix('Snapshot_dayofweek_'))
        final_df = final_df.join(pd.get_dummies(snap.day).add_prefix('Snapshot_dayofmonth_'))
        final_df = final_df.join(pd.get_dummies(snap.month).add_prefix('Snapshot_month_'))
        final_df = final_df.join(pd.get_dummies(snap.year).add_prefix('Snapshot_year_'))
        checkin = pd.to_datetime(final_df['Checkin_Date']).dt
        final_df = final_df.drop('Checkin_Date', axis=1)
        final_df = final_df.drop('WeekDay', axis=1)
        final_df = final_df.join(pd.get_dummies(checkin.dayofweek).add_prefix('Checkin_dayofweek_'))
        final_df = final_df.join(pd.get_dummies(checkin.day).add_prefix('Checkin_dayofmonth_'))
        final_df = final_df.join(pd.get_dummies(checkin.month).add_prefix('Checkin_month_'))
        final_df = final_df.join(pd.get_dummies(checkin.year).add_prefix('Checkin_year_'))
        hotel_dummies = pd.get_dummies(final_df['Hotel_Name']).add_prefix('Hotel_Name_')
        final_df = final_df.drop('Hotel_Name', axis=1)
        final_df = final_df.join(hotel_dummies)
        feature_cols = list(final_df.columns)
        # feature_cols[1] is 'Discount_Code' here (column order: DayDiff,
        # Discount_Code, then the dummy columns) — the label is excluded from
        # the duplicate check. The original comment claimed "discountperc",
        # but DiscountPerc was already dropped in Query().
        del feature_cols[1]
        final_df = pd.DataFrame(final_df.drop_duplicates(subset=feature_cols).reset_index(drop=True))
        return final_df

    def Classify(self, df):
        """Fit MultinomialNB on a 90/10 split; returns hold-out accuracy.

        Side effect: stores the confusion matrix on self for Results().
        """
        X = df.drop('Discount_Code', axis=1)
        Y = df['Discount_Code']
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=100)
        # naive bayes
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        self.confusion_matrix = confusion_matrix(y_test, pred)
        print(self.confusion_matrix)
        return accuracy_score(y_test, pred)

    def Results(self):
        """Print the confusion matrix and return per-class (FN, FP) arrays.

        Must be called after Classify(). FP is the column sum minus the
        diagonal (predicted as class k but wrong); FN is the row sum minus
        the diagonal (actually class k but missed).
        """
        cm = self.confusion_matrix
        FP = cm.sum(axis=0) - np.diag(cm)
        FN = cm.sum(axis=1) - np.diag(cm)
        print("The confusion matrix:")
        # Bug fix: the original had a bare `self.confusion_matrix` expression
        # here, which evaluates to the array and prints nothing.
        print(cm)
        return FN, FP
class DTClassifier():
    """Decision-tree pipeline for predicting Discount_Code.

    Expected call order: Query() -> Transform() -> Classify(); Results()
    and GetROC() report on the fitted model. Classify() stores the
    confusion matrix on the instance for Results().
    """

    def Query(self, df):
        """Reduce df to one row per stay, keeping the largest DiscountDiff.

        Groups by (Hotel_Name, Checkin_Date, Snapshot_Date, DayDiff,
        WeekDay); sorting by DiscountDiff descending first means .first()
        picks the row with the biggest absolute discount in each group.
        """
        grouped = df.sort_values("DiscountDiff", ascending=False).groupby(
            by=["Hotel_Name", "Checkin_Date", "Snapshot_Date", "DayDiff", "WeekDay"], as_index=True)
        return pd.DataFrame(grouped.first().reset_index())

    def Transform(self, df):
        """Encode text/date columns of df as numbers, in place; returns df.

        - Hotel_Name: integer category codes
        - WeekDay: 1..7 (Sunday=1 .. Saturday=7)
        - Checkin_Date / Snapshot_Date ('%m/%d/%Y %H:%M' strings): POSIX
          timestamps. NOTE(review): .timestamp() on a naive datetime uses
          the local timezone — confirm that is acceptable here.
        """
        all_days = {"Sunday": 1, "Monday": 2, "Tuesday": 3, "Wednesday": 4,
                    "Thursday": 5, "Friday": 6, "Saturday": 7}
        df['Hotel_Name'] = df["Hotel_Name"].astype('category').cat.codes
        df['Checkin_Date'] = df.apply(lambda row: datetime.strptime(row["Checkin_Date"], '%m/%d/%Y %H:%M').timestamp(), axis=1)
        df['WeekDay'] = df["WeekDay"].map(all_days)
        df['Snapshot_Date'] = df.apply(lambda row: datetime.strptime(row["Snapshot_Date"], '%m/%d/%Y %H:%M').timestamp(), axis=1)
        return df

    def Classify(self, df):
        """Fit a DecisionTreeClassifier on a 90/10 split; returns accuracy.

        Uses only the Checkin_Date, Snapshot_Date and Hotel_Name features.
        Side effect: stores the confusion matrix on self for Results().
        """
        features = ["Checkin_Date", "Snapshot_Date", "Hotel_Name"]
        df_X = pd.DataFrame(df, columns=features)[features]
        df_Y = df["Discount_Code"]
        X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.1, random_state=100)
        # decision tree
        clf = DecisionTreeClassifier(random_state=14)
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        self.confusion_matrix = confusion_matrix(y_test, pred)
        print(self.confusion_matrix)
        return accuracy_score(y_test, pred)

    def GetROC(self, df):
        """Plot a ROC curve and return the per-class AUC dict.

        Binarizes Discount_Code over classes [1, 2, 3, 4], retrains a tree
        on a 70/30 split, computes per-class and micro-average ROC, and
        plots the curve for class index 2 (discount code 3).
        NOTE(review): y_score comes from predict() (hard 0/1 labels), not
        predict_proba(), so each curve has a single operating point —
        confirm whether probability scores were intended.
        """
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        features = ["Checkin_Date", "Snapshot_Date", "Hotel_Name"]
        df_X = pd.DataFrame(df, columns=features)[features]
        df_Y = label_binarize(df["Discount_Code"], classes=[1, 2, 3, 4])
        X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.3, random_state=100)
        clf = DecisionTreeClassifier(random_state=14)
        y_score = clf.fit(X_train, y_train).predict(X_test)
        for i in range(4):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # Compute micro-average ROC curve and ROC area.
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        plt.figure()
        lw = 2
        plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()
        return roc_auc

    def Results(self):
        """Print the confusion matrix and return per-class (FN, FP) arrays.

        Must be called after Classify(). FP is the column sum minus the
        diagonal; FN is the row sum minus the diagonal.
        """
        cm = self.confusion_matrix
        FP = cm.sum(axis=0) - np.diag(cm)
        FN = cm.sum(axis=1) - np.diag(cm)
        print("The confusion matrix:")
        # Bug fix: the original had a bare `self.confusion_matrix` expression
        # here, which evaluates to the array and prints nothing.
        print(cm)
        return FN, FP
print("Reading from CSV into dataframe.")
df = pd.read_csv("Hotels_data_Changed.csv", encoding="ISO-8859-1")

# The classifiers expect underscore-style column names; mirror the CSV's
# space-separated headers under those aliases (the originals are kept).
for source_col, alias in (("Checkin Date", "Checkin_Date"),
                          ("Snapshot Date", "Snapshot_Date"),
                          ("Hotel Name", "Hotel_Name"),
                          ("Discount Code", "Discount_Code")):
    df[alias] = df[source_col]

# Decision-tree pipeline: query, transform, classify, then report.
dt = DTClassifier()
dt_df = dt.Transform(dt.Query(df))
print("DecisionTree accuracy: {:.1%} ".format(dt.Classify(dt_df)))
FN, FP = dt.Results()
print(str(dt.GetROC(dt_df)))
print("DT False Negative for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(*FN[:4]))
print("DT False Positive for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(*FP[:4]))

# Naive-Bayes pipeline, same reporting.
nb = NBClassifier()
print("MultinomialNB accuracy: {:.1%} ".format(nb.Classify(nb.Transform(nb.Query(df)))))
FN, FP = nb.Results()
print("NB False Negative for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(*FN[:4]))
print("NB False Positive for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(*FP[:4]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement