# Untitled

'''
FIRST SUBMISSION
ACC 0.88864
'''
import sys
import numpy as np
import pandas as pd
import keras
from keras.layers import Dense, Input,Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
import seaborn as sns
import matplotlib.pyplot as plt
import math


def loadCVS(pd_dataName):
    """Load a train or test CSV and convert it into model-ready arrays.

    Processing steps, in order:
      * missing values are replaced by 0;
      * the three ``*_Duration`` columns are clipped at 0.01 and
        log-transformed;
      * ``Month`` and ``VisitorType`` strings are mapped to integer codes and
        ``Weekend`` is cast to int;
      * the ``Month`` column is removed from the feature matrix, the remaining
        features are standardised, and ``Month`` is appended as 12 one-hot
        columns.

    A file containing an ``ID`` column is treated as the test set (no labels);
    otherwise the ``Revenue`` column is used as the label.

    Args:
        pd_dataName: path of the CSV file (converted with ``str``).

    Returns:
        Tuple ``(X, Y)``. ``X`` is a 2-D float array whose last 12 columns are
        the one-hot month. ``Y`` is a ``(n_rows, 2)`` one-hot array of
        ``Revenue`` for a train file, or ``None`` for a test file.
    """
    pd_dataName = str(pd_dataName)

    pd_data = pd.read_csv(pd_dataName)

    # Report the number of missing values per column, then fill them with 0.
    print(pd_data.isnull().sum())
    pd_data = pd_data.fillna(0)

    # Clip before taking the log so that log() never receives a value <= 0.
    # NOTE: the space in 'Homepage _Duration' is part of the CSV header.
    for column in ('Homepage _Duration', 'Aboutus_Duration', 'Contactus_Duration'):
        pd_data[column] = np.log(pd_data[column].clip(lower=0.01))

    # Remap strings to integer codes; values that are not in a mapping are
    # passed through unchanged and must already be castable to int.
    month_mapping = {'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3, 'May': 4, 'Aug': 7, 'Dec': 11, 'Jul': 6, 'June': 5,
                     'Nov': 10, 'Oct': 9, 'Sep': 8}
    pd_data['Month'] = pd_data['Month'].apply(lambda s: month_mapping.get(s, s)).astype('int')

    vt_mapping = {'Returning_Visitor': 0, 'New_Visitor': 1, 'Other': 2}
    pd_data['VisitorType'] = pd_data['VisitorType'].apply(lambda s: vt_mapping.get(s, s)).astype('int')

    # Remap bool to 0/1.
    pd_data['Weekend'] = pd_data['Weekend'].astype('int')

    pd_data = pd_data.astype('float64')

    # DataFrame.info() prints by itself and returns None.
    pd_data.info()

    if 'ID' in pd_data.columns:  # test csv
        Y = None
        pd_data = pd_data.drop('ID', axis=1)
    else:  # train csv
        # One-hot with a fixed width of 2: row [1, 0] is Revenue == 0 and
        # row [0, 1] is Revenue == 1.
        Y = np.eye(2)[pd_data['Revenue'].values.astype(int)]
        pd_data = pd_data.drop('Revenue', axis=1)

    X = pd_data.values
    print("X shape before deleting", X.shape)

    # Take the month codes out of the feature matrix. The column is located by
    # name, and np.delete returns a new array, so its result must be assigned.
    month_position = pd_data.columns.get_loc('Month')
    month_codes = X[:, month_position].astype(int)
    X = np.delete(X, month_position, axis=1)

    # NOTE(review): this standardises with ONE mean/std over the whole matrix,
    # not per feature, and each file uses its own statistics -- confirm that
    # this is intended.
    X = (X - np.mean(X)) / np.std(X)
    print("X shape after deleting", X.shape)

    # Fixed width of 12 so train and test always get the same number of
    # columns, even when a file does not contain every month.
    month_one_hot = np.eye(12)[month_codes]
    print(X.shape, month_one_hot.shape)
    X = np.concatenate((X, month_one_hot), axis=1)
    print("X shape after concatenating", X.shape)

    return X, Y


def nn1():
    """Build the uncompiled classifier network.

    Two sigmoid hidden layers (40 and 20 units) followed by a 2-unit softmax
    output. No input shape is declared here.

    Returns:
        The assembled ``Sequential`` model.
    """
    layer_stack = [
        Dense(40, activation='sigmoid'),
        Dense(20, activation='sigmoid'),
        Dense(2, activation='softmax'),
    ]
    return Sequential(layer_stack)


if __name__ == '__main__':
    # Script entry point: load data, oversample, train, plot, save, predict.
    print("python approach3.py trainSet.csv xTest.csv outpd_data.csv")
    print("python approach3.py [1]          [2]         [3]")
    if len(sys.argv) < 4:
        # The usage text was just printed; stop instead of raising IndexError.
        sys.exit(1)

    xTrain, yTrain = loadCVS(sys.argv[1])
    xTest, _ = loadCVS(sys.argv[2])

    print(xTrain.shape, yTrain.shape)

    # Oversampling: append one extra copy of every row whose first one-hot
    # label component is 1, in one vectorised step (rows are appended in
    # descending index order).
    # NOTE(review): column 0 of the one-hot label corresponds to Revenue == 0.
    # If the intent was to duplicate the Revenue == 1 rows, the condition
    # should test column 1 instead -- confirm against the class balance.
    duplicated = np.flatnonzero(yTrain[:, 0] == 1)[::-1]
    xTrain = np.concatenate((xTrain, xTrain[duplicated]), axis=0)
    yTrain = np.concatenate((yTrain, yTrain[duplicated]), axis=0)

    # Shuffle features and labels with the same permutation; fit() is called
    # with validation_split, so the row order matters.
    a = np.arange(xTrain.shape[0])
    np.random.shuffle(a)
    print(a.shape)
    xTrain = xTrain[a]
    yTrain = yTrain[a]

    print(xTrain.shape, yTrain.shape)
    input("Okay?")

    m = nn1()
    m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    filepath = "weights.hdf5"
    # mode must be one of 'auto', 'min', 'max'; val_loss is to be minimised.
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=100, verbose=0, mode='auto',
                                  baseline=None, restore_best_weights=True)

    callbacks_list = [checkpoint, earlystopping]
    hist = m.fit(xTrain, yTrain, validation_split=0.1, epochs=1000, callbacks=callbacks_list, shuffle=True)

    # Training curves: accuracy on the left, loss on the right.
    plt.subplot(1, 2, 1)
    plt.plot(hist.history['acc'])
    plt.xlabel('Epoch')
    plt.ylabel("Accuracy")

    plt.subplot(1, 2, 2)
    plt.plot(hist.history['loss'])
    plt.xlabel('Epoch')
    plt.ylabel("Loss")

    plt.show()

    # summary() prints the table itself.
    m.summary()

    # Save the model architecture to JSON.
    model_json = m.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
        print("Model saved to json.")

    # Predict the class (argmax over the two softmax outputs) for the test set.
    yTest = m.predict(xTest)
    yTest = np.argmax(yTest, axis=1)

    # Submission file: IDs are 1-based row numbers.
    pd_test = pd.DataFrame(columns=['ID', 'Revenue'])
    pd_test['ID'] = pd.Series(np.arange(1, len(yTest) + 1))
    pd_test['Revenue'] = pd.Series(yTest)
    pd_test.to_csv(sys.argv[3], index=False)