Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''
- FIRST SUBMISSION
- ACC 0.88864
- '''
- import sys
- import numpy as np
- import pandas as pd
- import keras
- from keras.layers import Dense, Input,Dropout
- from keras.models import Sequential
- from keras.utils import to_categorical
- from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
- import seaborn as sns
- import matplotlib.pyplot as plt
- import math
def loadCVS(pd_dataName):
    """Load a train or test CSV and convert it into model-ready arrays.

    Processing steps:
      * missing values are replaced with 0;
      * the three ``*_Duration`` columns are clipped at 0.01 and
        log-transformed;
      * ``Month`` and ``VisitorType`` strings are mapped to integer codes and
        ``Weekend`` is cast to int;
      * the ``Month`` column is taken out of the feature matrix, the remaining
        features are standardised, and ``Month`` is appended back as a
        fixed-width (12 column) one-hot block.

    Args:
        pd_dataName: Path of the CSV file (converted with ``str``). A file
            with an ``ID`` column is treated as the test set; otherwise a
            ``Revenue`` label column is expected.

    Returns:
        Tuple ``(X, Y)``. ``X`` is a 2-D float array of features. ``Y`` is the
        one-hot encoded ``Revenue`` column (2 columns) for a train CSV and
        ``None`` for a test CSV.
    """
    pd_data = pd.read_csv(str(pd_dataName))

    # Report how many values are missing per column, then fill them with 0.
    print(pd_data.isnull().sum())
    pd_data = pd_data.fillna(0)

    # Clip before taking the log so that zero durations do not become -inf.
    # NOTE: 'Homepage _Duration' really contains a space in the column name.
    for column in ('Homepage _Duration', 'Aboutus_Duration', 'Contactus_Duration'):
        pd_data[column] = np.log(pd_data[column].clip(lower=0.01))

    # Remap strings to integer codes; values not present in a mapping are
    # passed through unchanged and must already be int-convertible.
    month_mapping = {'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3, 'May': 4, 'June': 5,
                     'Jul': 6, 'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11}
    pd_data['Month'] = pd_data['Month'].apply(
        lambda s: month_mapping.get(s, s)).astype('int')

    vt_mapping = {'Returning_Visitor': 0, 'New_Visitor': 1, 'Other': 2}
    pd_data['VisitorType'] = pd_data['VisitorType'].apply(
        lambda s: vt_mapping.get(s, s)).astype('int')

    # Remap bool.
    pd_data['Weekend'] = pd_data['Weekend'].astype('int')
    pd_data = pd_data.astype('float64')
    pd_data.info()  # info() prints by itself and returns None

    if 'ID' in pd_data.columns:  # test csv
        Y = None
        pd_data = pd_data.drop('ID', axis=1)
    else:  # train csv
        # Fix the width to 2 so the label shape does not depend on which
        # classes happen to be present in the file.
        Y = to_categorical(pd_data['Revenue'].values.reshape((-1, 1)), num_classes=2)
        pd_data = pd_data.drop('Revenue', axis=1)

    X = pd_data.values
    print("X shape before deleting", X.shape)

    # Pull the Month column out by name and actually remove it from X
    # (np.delete returns a new array; columns are axis=1).
    month_index = pd_data.columns.get_loc('Month')
    xTempMonth = X[:, month_index].astype('int')
    X = np.delete(X, month_index, axis=1)

    # Standardise with a single mean/std computed over the whole matrix
    # (not per feature), computed separately for each file that is loaded.
    X = (X - np.mean(X)) / np.std(X)
    print("X shape after deleting", X.shape)

    # Always emit 12 month columns so train and test matrices have the same
    # width even when one of them does not contain every month.
    xTempMonth = to_categorical(xTempMonth, num_classes=12)
    print(X.shape, xTempMonth.shape)
    X = np.concatenate((X, xTempMonth), axis=1)
    print("X shape after concatenating", X.shape)

    return X, Y
def nn1():
    """Build the uncompiled classifier network.

    The network is a stack of fully connected layers: 40 sigmoid units,
    20 sigmoid units, then a 2-unit softmax output. No input shape is
    declared here and no dropout layers are used.

    Returns:
        The assembled ``Sequential`` model (not compiled).
    """
    layer_specs = (
        (40, 'sigmoid'),
        (20, 'sigmoid'),
        (2, 'softmax'),
    )
    network = Sequential()
    for units, activation in layer_specs:
        network.add(Dense(units, activation=activation))
    return network
if __name__ == '__main__':
    # Script entry point: load the train/test CSVs, duplicate part of the
    # training set, train the network with checkpointing and early stopping,
    # plot the training curves, save the architecture and write predictions.
    print("python approach3.py trainSet.csv xTest.csv outpd_data.csv")
    print("python approach3.py [1] [2] [3]")
    if len(sys.argv) < 4:
        # Exit cleanly after the usage text instead of failing with IndexError.
        sys.exit(1)

    xTrain, yTrain = loadCVS(sys.argv[1])
    xTest, _ = loadCVS(sys.argv[2])
    print(xTrain.shape, yTrain.shape)

    # Append one extra copy of every row whose first label column is 1.
    # The indices are reversed to keep the same row order as appending while
    # walking the data backwards; a single concatenate avoids re-copying the
    # whole array once per duplicated row.
    # NOTE(review): column 0 of the one-hot labels is the indicator for label
    # value 0 — confirm this is the class that was meant to be oversampled.
    duplicate_idx = np.flatnonzero(yTrain[:, 0] == 1)[::-1]
    xTrain = np.concatenate((xTrain, xTrain[duplicate_idx]), axis=0)
    yTrain = np.concatenate((yTrain, yTrain[duplicate_idx]), axis=0)

    # Shuffle samples and labels with the same random permutation.
    a = np.random.permutation(xTrain.shape[0])
    print(a.shape)
    xTrain = xTrain[a]
    yTrain = yTrain[a]
    print(xTrain.shape, yTrain.shape)
    input("Okay?")  # manual confirmation before the (long) training run

    m = nn1()
    m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    filepath = "weights.hdf5"
    # Keep only the weights with the lowest validation loss
    # (mode must be 'min'; the previous ',min' was not a valid mode).
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=100,
                                  verbose=0, mode='auto', baseline=None,
                                  restore_best_weights=True)
    callbacks_list = [checkpoint, earlystopping]
    hist = m.fit(xTrain, yTrain, validation_split=0.1, epochs=1000,
                 callbacks=callbacks_list, shuffle=True)

    # Training curves: accuracy on the left, loss on the right.
    plt.subplot(1, 2, 1)
    plt.plot(hist.history['acc'])
    plt.xlabel('Epoch')
    plt.ylabel("Accuracy")
    plt.subplot(1, 2, 2)
    plt.plot(hist.history['loss'])
    plt.xlabel('Epoch')
    plt.ylabel("Loss")
    plt.show()

    print(m.summary())

    # Save the model architecture to JSON.
    model_json = m.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    print("Model saved to json.")

    # Predict the class of every test row and write the submission file.
    # IDs are generated as 1..N in row order, not read from the test CSV.
    yTest = m.predict(xTest)
    yTest = np.argmax(yTest, axis=1)
    pd_test = pd.DataFrame(columns=['ID', 'Revenue'])
    pd_test['ID'] = pd.Series(np.arange(1, len(yTest) + 1))
    pd_test['Revenue'] = pd.Series(yTest)
    pd_test.to_csv(sys.argv[3], index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement