# Kaggle Code 11/02/16

from numpy import loadtxt, zeros, ones, array, linspace, logspace
from pylab import scatter, show, title, xlabel, ylabel, plot, contour
import csv
import pandas as pd
import matplotlib.pyplot as plt
# Load the training set from train.csv (path is relative to the working directory).
traindf = pd.read_csv("train.csv")
# Peek at the first ten rows; the return value is not stored.
traindf.head(10)

# Per-column summary statistics; the 'min', '25%', '75%', 'max' and 'mean'
# rows are looked up further down when checking for truncation.
descrip = traindf.describe()

# Column names handled as categorical variables, grouped by column family.
Categorical = [
    'Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5',
    'Product_Info_6', 'Product_Info_7',
    'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5',
    'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4',
    'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7',
    'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3',
    'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8',
    'Insurance_History_9',
    'Family_Hist_1',
    'Medical_History_2', 'Medical_History_3', 'Medical_History_4',
    'Medical_History_5', 'Medical_History_6', 'Medical_History_7',
    'Medical_History_8', 'Medical_History_9', 'Medical_History_11',
    'Medical_History_12', 'Medical_History_13', 'Medical_History_14',
    'Medical_History_16', 'Medical_History_17', 'Medical_History_18',
    'Medical_History_19', 'Medical_History_20', 'Medical_History_21',
    'Medical_History_22', 'Medical_History_23', 'Medical_History_25',
    'Medical_History_26', 'Medical_History_27', 'Medical_History_28',
    'Medical_History_29', 'Medical_History_30', 'Medical_History_31',
    'Medical_History_33', 'Medical_History_34', 'Medical_History_35',
    'Medical_History_36', 'Medical_History_37', 'Medical_History_38',
    'Medical_History_39', 'Medical_History_40', 'Medical_History_41',
]

# Column names handled as continuous variables.
Continuous = [
    'Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI',
    'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
    'Insurance_History_5',
    'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
]

# Column names handled as discrete variables.
Discrete = [
    'Medical_History_1', 'Medical_History_10', 'Medical_History_15',
    'Medical_History_24', 'Medical_History_32',
]



# Starting with the continuous variables, check each one for truncation.
# A variable is flagged as upper truncated when its 75th percentile equals its
# maximum, and as lower truncated when its minimum equals its 25th percentile.
# Every lower-truncated variable then gets a companion indicator column named
# oldname_d: 1 if the value is greater than the column mean, 0 otherwise.
# NOTE(review): upper-truncated variables are only reported, not discretized --
# confirm whether they should get an indicator column as well.
lower_trunc = []
upper_trunc = []
# Double brackets select a one-column DataFrame (single brackets would give a
# Series), so the assignments below add real columns; .copy() keeps those
# additions from touching traindf.
Cont = traindf[['Id']].copy()

for i in Continuous:
    y = traindf[i]
    name = y.name
    Cont[name] = y
    if y.dtype == 'float64':
        if descrip[name]['75%'] == descrip[name]['max']:
            upper_trunc.append(name)
            print(name + ' upper truncated')
        if descrip[name]['min'] == descrip[name]['25%']:
            lower_trunc.append(name)
            print(name + ' lower truncated')


for i in lower_trunc:
    mean = descrip[i]['mean']
    nname = i + '_d'
    # Vectorized comparison: missing values compare False and so map to 0,
    # matching the "default 0, set 1 when above the mean" rule. Kept as
    # float64 so the indicator has the same dtype as the source column.
    Cont[nname] = (Cont[i] > mean).astype('float64')

# For categorical variables, determine the categories present in each variable.
# The stated goal is a dummy column per variable category; the statements below
# only copy each column and collect its distinct values.

# Double brackets select a one-column DataFrame (single brackets would give a
# Series), so Cat[name] = y adds a column; .copy() keeps traindf untouched.
Cat = traindf[['Id']].copy()

for i in Categorical:
    y=traindf[i]
    name=y.name
    Cat[name]=y
    # Distinct values observed in this column.
    Cat_label = Cat[name].unique()