Advertisement
Kiwi_Jenn

Kaggle Code 11/02/16

Feb 10th, 2016
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.06 KB | None | 0 0
  1. from numpy import loadtxt, zeros, ones, array, linspace, logspace
  2. from pylab import scatter, show, title, xlabel, ylabel, plot, contour
  3. import csv
  4. import pandas as pd
  5. import matplotlib.pyplot as plt
  6. traindf = pd.read_csv("train.csv")
  7. traindf.head(n=10)
  8.  
  9. descrip = traindf.describe()
  10.  
  11. Categorical = ['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5',
  12. 'Product_Info_6', 'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3',
  13. 'Employment_Info_5', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4',
  14. 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1',
  15. 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4',
  16. 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Family_Hist_1',
  17. 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5',
  18. 'Medical_History_6', 'Medical_History_7', 'Medical_History_8', 'Medical_History_9',
  19. 'Medical_History_11', 'Medical_History_12', 'Medical_History_13', 'Medical_History_14',
  20. 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19',
  21. 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23',
  22. 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 'Medical_History_28',
  23. 'Medical_History_29', 'Medical_History_30', 'Medical_History_31', 'Medical_History_33',
  24. 'Medical_History_34', 'Medical_History_35', 'Medical_History_36', 'Medical_History_37',
  25. 'Medical_History_38', 'Medical_History_39', 'Medical_History_40', 'Medical_History_41']
  26.  
  27. Continuous = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1',
  28. 'Employment_Info_4', 'Employment_Info_6', 'Insurance_History_5', 'Family_Hist_2',
  29. 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5']
  30.  
  31. Discrete = ['Medical_History_1', 'Medical_History_10', 'Medical_History_15',
  32. 'Medical_History_24', 'Medical_History_32']
  33.  
  34.  
  35.  
  36.  
  37. #Starting with continuous variables, check for truncation and if truncated re-name as discrete variable
  38. #with oldname_t = 1 if variable is greater than mean, and oldname_t = 0 if variable is <= mean
  39. lower_trunc=[]
  40. upper_trunc=[]
  41. Cont = traindf['Id']
  42.  
  43. for i in Continuous:
  44. y=traindf[i]
  45. name = y.name
  46. Cont[name] = y
  47. if y.dtype == 'float64':
  48. if descrip[name]['75%'] == descrip[name]['max']:
  49. upper_trunc.append(name)
  50. print upper_trunc + 'upper truncated'
  51. if descrip[name]['min'] == descrip[name]['25%']:
  52. lower_trunc.append(name)
  53. print name + ' lower truncated'
  54.  
  55.  
  56. for i in lower_trunc:
  57. mean = descrip[i]['mean']
  58. nname = i + '_d'
  59. Cont[nname] = Cont[i]
  60. for j in range(0,len(Cont[i])):
  61. Cont[nname][j] = 0
  62. if Cont[i][j] > mean:
  63. Cont[nname][j] = 1
  64.  
  65. #For categorical variables, determine the number of categories for each variable,
  66. #and create dummy column for each variable category
  67.  
  68. Cat = traindf['Id']
  69.  
  70. for i in Categorical:
  71. y=traindf[i]
  72. name=y.name
  73. Cat[name]=y
  74. Cat_label = Cat[name].unique()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement