Advertisement
Guest User

Untitled

a guest
Mar 19th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.12 KB | None | 0 0
  1.  
  2. # coding: utf-8
  3.  
  4. # In[ ]:
  5.  
  6. '''
  7. import pandas as pd
  8. from datetime import datetime
  9. from pandas import *
  10.  
  11. # Task 1
  12. df = pd.DataFrame.from_csv("hotels_data.csv")
  13.  
  14. # do same but attach it to the dataframe
  15. df['DiscountDiff'] = df.apply(lambda row: row["Original Price"] - row["Discount Price"], axis=1)
  16. df['DayDiff'] = df.apply(lambda row: Timedelta(datetime.strptime(row["Checkin Date"], '%m/%d/%Y %H:%M') - datetime.strptime(row["Snapshot Date"], '%m/%d/%Y %H:%M')).days, axis=1)
  17. df['WeekDay'] = df.apply(lambda row: to_datetime(row["Checkin Date"]).weekday_name, axis=1)
  18. df['DiscountPerc'] = df.apply(lambda row: ((row["DiscountDiff"] * 100) / row["Original Price"]), axis=1)
  19. df.to_csv("Hotels_data_Changed.csv")
  20. '''
  21.  
  22. # In[134]:
  23.  
  24.  
import io
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
from scipy import sparse as sp
from sklearn import datasets, linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Star imports are kept after the stdlib imports and in their original
# relative order so that any name shadowing they cause is unchanged.
from pandas import *
from sklearn.model_selection import *
from sklearn.metrics import *
  41.  
  42. class NBClassifier():
  43. def Query(self, df):
  44. work_df = pd.DataFrame(df, columns = ["WeekDay", "Snapshot_Date", "Checkin_Date", "DayDiff", "Hotel_Name", "Discount_Code", "DiscountPerc"])
  45. a = work_df.sort_values("DiscountPerc", ascending=False).groupby(by=["WeekDay","Snapshot_Date","Checkin_Date","DayDiff","Hotel_Name"], as_index=True)
  46.  
  47. d = pd.DataFrame(a.size().reset_index(name='counts'))
  48. pp= d[d['counts'] == 4]
  49. new_work_df = pd.merge(work_df, pp, on =["WeekDay", "Snapshot_Date", "Checkin_Date", "DayDiff", "Hotel_Name"], how='inner')
  50. new_work =pd.DataFrame(new_work_df.sort_values(["Snapshot_Date","Checkin_Date","Discount_Code"], ascending=[True, True, True]).reset_index(drop=True))
  51.  
  52. a = new_work.sort_values("DiscountPerc", ascending=False).groupby(by=["WeekDay","Snapshot_Date","Checkin_Date","DayDiff","Hotel_Name"], as_index=False)
  53. d = pd.DataFrame(a["DiscountPerc"].max().reset_index(drop=True))
  54. new_work_df = pd.merge(new_work, d, on =["WeekDay", "Snapshot_Date", "Checkin_Date", "DayDiff", "Hotel_Name", "DiscountPerc"], how='inner')
  55. new_work_df = new_work_df.drop("counts", axis=1).drop("DiscountPerc", axis=1)
  56. new_work =pd.DataFrame(new_work_df.sort_values(["Snapshot_Date","Checkin_Date","Discount_Code"], ascending=[True, True, True]).reset_index(drop=True))
  57. new_work.sort_index(inplace=True)
  58. return new_work
  59.  
  60. def Transform(self, df):
  61. snapshot_dates = pd.to_datetime(df['Snapshot_Date']).dt
  62.  
  63. Snapshot_dayofweek = pd.DataFrame({'Snapshot_dayofweek':snapshot_dates.dayofweek})
  64. Snapshot_dayofmonth = pd.DataFrame({'Snapshot_dayofmonth':snapshot_dates.day})
  65. Snapshot_month = pd.DataFrame({'Snapshot_month':snapshot_dates.month})
  66. Snapshot_year = pd.DataFrame({'Snapshot_year':snapshot_dates.year})
  67. final_df = df.drop('Snapshot_Date',axis=1)
  68. final_df = final_df.join(pd.get_dummies(Snapshot_dayofweek['Snapshot_dayofweek']).add_prefix('Snapshot_dayofweek_'))
  69. final_df = final_df.join(pd.get_dummies(Snapshot_dayofmonth['Snapshot_dayofmonth']).add_prefix('Snapshot_dayofmonth_'))
  70. final_df = final_df.join(pd.get_dummies(Snapshot_month['Snapshot_month']).add_prefix('Snapshot_month_'))
  71. final_df = final_df.join(pd.get_dummies(Snapshot_year['Snapshot_year']).add_prefix('Snapshot_year_'))
  72.  
  73. checkin_dates = pd.to_datetime(final_df['Checkin_Date']).dt
  74. Checkin_dayofweek = pd.DataFrame({'Checkin_dayofweek':checkin_dates.dayofweek})
  75. Checkin_dayofmonth = pd.DataFrame({'Checkin_dayofmonth':checkin_dates.day})
  76. Checkin_month = pd.DataFrame({'Checkin_month':checkin_dates.month})
  77. Checkin_year = pd.DataFrame({'Checkin_year':checkin_dates.year})
  78. final_df = final_df.drop('Checkin_Date',axis=1)
  79. final_df = final_df.drop('WeekDay',axis=1)
  80. final_df = final_df.join(pd.get_dummies(Checkin_dayofweek['Checkin_dayofweek']).add_prefix('Checkin_dayofweek_'))
  81. final_df = final_df.join(pd.get_dummies(Checkin_dayofmonth['Checkin_dayofmonth']).add_prefix('Checkin_dayofmonth_'))
  82. final_df = final_df.join(pd.get_dummies(Checkin_month['Checkin_month']).add_prefix('Checkin_month_'))
  83. final_df = final_df.join(pd.get_dummies(Checkin_year['Checkin_year']).add_prefix('Checkin_year_'))
  84.  
  85. encoded_hotel_names = pd.get_dummies(final_df['Hotel_Name']).add_prefix('Hotel_Name_')
  86. final_df = final_df.drop('Hotel_Name', axis=1)
  87. final_df = final_df.join(encoded_hotel_names)
  88.  
  89. op = list(final_df.columns)
  90.  
  91. # Delete the discountperc column
  92. del op[1]
  93.  
  94. final_df = pd.DataFrame(final_df.drop_duplicates(subset=op).reset_index(drop=True))
  95.  
  96. return final_df
  97.  
  98. def Classify(self, df):
  99. X = df.drop('Discount_Code', axis=1)
  100. Y = df['Discount_Code']
  101.  
  102. X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.1, random_state = 100)
  103.  
  104. # naive bayes
  105. clf = MultinomialNB()
  106. clf.fit(X_train, y_train)
  107. pred = clf.predict(X_test)
  108.  
  109. self.confusion_matrix = confusion_matrix(y_test, pred)
  110. print(self.confusion_matrix)
  111. return accuracy_score(y_test, pred)
  112.  
  113. def Results(self):
  114. # statistical measures
  115. FP = self.confusion_matrix.sum(axis=0) - np.diag(self.confusion_matrix)
  116. FN = self.confusion_matrix.sum(axis=1) - np.diag(self.confusion_matrix)
  117. TP = np.diag(self.confusion_matrix)
  118. TN = self.confusion_matrix.sum() - (FP + FN + TP)
  119.  
  120. # Computing sums:
  121. x = np.array(TP)
  122. tp_sum = np.sum(x)
  123. x = np.array(TN)
  124. tn_sum = np.sum(x)
  125. x = np.array(FP)
  126. fp_sum = np.sum(x)
  127. x = np.array(FN)
  128. fn_sum = np.sum(x)
  129.  
  130. print("The confusion matrix:")
  131. self.confusion_matrix
  132.  
  133. return FN, FP
  134.  
  135. class DTClassifier():
  136. def Query(self, df):
  137.  
  138. #transform data to correct format
  139. a = df.sort_values("DiscountDiff", ascending=False).groupby(by=["Hotel_Name","Checkin_Date", "Snapshot_Date", "DayDiff", "WeekDay"], as_index=True)
  140.  
  141. new_df = pd.DataFrame(a.first().reset_index())
  142. return new_df
  143.  
  144. def Transform(self, df):
  145. all_days = {"Sunday" : 1, "Monday": 2, "Tuesday" : 3, "Wednesday" : 4, "Thursday" : 5, "Friday" : 6, "Saturday" : 7}
  146.  
  147. df['Hotel_Name'] = df["Hotel_Name"].astype('category').cat.codes
  148. df['Checkin_Date'] = df.apply(lambda row: datetime.strptime(row["Checkin_Date"], '%m/%d/%Y %H:%M').timestamp(), axis=1)
  149. df['WeekDay'] = df["WeekDay"].map(all_days)
  150. df['Snapshot_Date'] = df.apply(lambda row: datetime.strptime(row["Snapshot_Date"], '%m/%d/%Y %H:%M').timestamp(), axis=1)
  151. return df
  152.  
  153. def Classify(self, df):
  154. features = ["Checkin_Date", "Snapshot_Date", "Hotel_Name"]
  155. super_new_df = pd.DataFrame(df, columns=features)
  156.  
  157. df_X = super_new_df[features]
  158. df_Y = df["Discount_Code"]
  159.  
  160. X_train, X_test, y_train, y_test = train_test_split( df_X, df_Y, test_size = 0.1, random_state = 100)
  161.  
  162.  
  163. # descision tree
  164. clf = DecisionTreeClassifier(random_state=14)
  165. clf = clf.fit(X_train, y_train)
  166. pred = clf.predict(X_test)
  167.  
  168. self.confusion_matrix = confusion_matrix(y_test, pred)
  169. print(self.confusion_matrix)
  170. return accuracy_score(y_test, pred)
  171.  
  172. def GetROC(self, df):
  173. fpr = dict()
  174. tpr = dict()
  175. roc_auc = dict()
  176.  
  177. features = ["Checkin_Date", "Snapshot_Date", "Hotel_Name"]
  178. super_new_df = pd.DataFrame(df, columns=features)
  179.  
  180. df_X = super_new_df[features]
  181. df_Y = df["Discount_Code"]
  182.  
  183. df_Y = label_binarize(df_Y, classes=[1, 2, 3, 4])
  184.  
  185.  
  186. X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size = 0.3, random_state = 100)
  187. clf = DecisionTreeClassifier(random_state=14)
  188. y_score = clf.fit(X_train, y_train).predict(X_test)
  189.  
  190. #self.confusion_matrix = confusion_matrix(y_test, y_score)
  191.  
  192. for i in range(4):
  193. fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
  194. roc_auc[i] = auc(fpr[i], tpr[i])
  195.  
  196. # Compute micro-average ROC curve and ROC area
  197. fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
  198. roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
  199.  
  200. plt.figure()
  201. lw = 2
  202. plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
  203. plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
  204. plt.xlim([0.0, 1.0])
  205. plt.ylim([0.0, 1.05])
  206. plt.xlabel('False Positive Rate')
  207. plt.ylabel('True Positive Rate')
  208. plt.title('Receiver operating characteristic example')
  209. plt.legend(loc="lower right")
  210. plt.show()
  211.  
  212. return roc_auc
  213.  
  214. def Results(self):
  215. # statistical measures
  216. FP = self.confusion_matrix.sum(axis=0) - np.diag(self.confusion_matrix)
  217. FN = self.confusion_matrix.sum(axis=1) - np.diag(self.confusion_matrix)
  218. TP = np.diag(self.confusion_matrix)
  219. TN = self.confusion_matrix.sum() - (FP + FN + TP)
  220.  
  221. # Computing sums:
  222. x = np.array(TP)
  223. tp_sum = np.sum(x)
  224. x = np.array(TN)
  225. tn_sum = np.sum(x)
  226. x = np.array(FP)
  227. fp_sum = np.sum(x)
  228. x = np.array(FN)
  229. fn_sum = np.sum(x)
  230.  
  231. print("The confusion matrix:")
  232. self.confusion_matrix
  233.  
  234. return FN, FP
  235.  
  236.  
  237. print("Reading from CSV into dataframe.")
  238. df = pd.read_csv("Hotels_data_Changed.csv", encoding = "ISO-8859-1")
  239.  
  240. df['Checkin_Date'] = df['Checkin Date']
  241. df['Snapshot_Date'] = df['Snapshot Date']
  242. df['Hotel_Name'] = df['Hotel Name']
  243. df['Discount_Code'] = df['Discount Code']
  244.  
  245. dt = DTClassifier()
  246. dt_df = dt.Transform(dt.Query(df))
  247.  
  248. print("DecisionTree accuracy: {:.1%} ".format(dt.Classify(dt_df)))
  249.  
  250. FN, FP = dt.Results()
  251.  
  252. print(str(dt.GetROC(dt_df)))
  253.  
  254. print("DT False Negative for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(FN[0], FN[1], FN[2], FN[3]))
  255. print("DT False Positive for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(FP[0], FP[1], FP[2], FP[3]))
  256.  
  257. nb = NBClassifier()
  258.  
  259. print("MultinomialNB accuracy: {:.1%} ".format(nb.Classify(nb.Transform(nb.Query(df)))))
  260.  
  261. FN, FP = nb.Results()
  262.  
  263. print("NB False Negative for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(FN[0], FN[1], FN[2], FN[3]))
  264. print("NB False Positive for each Discount Code: 1-{}, 2-{}, 3-{}, 4-{}".format(FP[0], FP[1], FP[2], FP[3]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement