Fenny_Theo

<Type2>540_train Data Preprocessing

Jul 22nd, 2021
1,181
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2. import numpy as np
  3. import h5py
  4. import matplotlib
  5. #matplotlib.use('agg')
  6. import matplotlib.pyplot as plt
  7. from keras.models import Model, Sequential
  8. from keras.layers import Dense, Input, concatenate,Conv1D, Conv2D, MaxPooling2D, Conv2DTranspose,MaxPooling1D, Cropping2D, Multiply, subtract, Flatten, Reshape, Permute, LSTM, TimeDistributed,Dropout,BatchNormalization,UpSampling1D
  9. from keras.optimizers import SGD
  10. from keras.callbacks import ModelCheckpoint,EarlyStopping
  11. import tensorflow as tf
  12. from keras.models import load_model
  13. from keras import optimizers
  14. from keras import regularizers
  15. from math import sqrt
  16. from sklearn.metrics import mean_squared_error
  17. from sklearn.metrics import mean_absolute_error
  18. import pickle
  19. from sklearn.preprocessing import StandardScaler
  20. import time
  21. from sklearn import preprocessing
  22. from numpy import argmax
  23. from keras.utils import to_categorical
  24. from tabulate import tabulate
  25. from numpy import array
  26. from imblearn.over_sampling import SMOTE
  27. from imblearn.combine import SMOTEENN
  28. from collections import Counter
  29.  
  30. #import the data
  31. #path="D:\\NCSU Research\\GEARS_Jul9\GEARS\\540\\x_train_540_final.pkl"
  32. #data=np.load(path, allow_pickle=True) #1. in the path, change \ to \\; 2. add allow_pickle=True
  33.  
  # Load the training table for subject 540 from its pickle file (the author's
  # note below records that the loaded object is a pandas DataFrame).
  # NOTE(review): pickle.load executes code embedded in the file -- only use it
  # on files from a trusted source.
  # The path is the same at runtime as before; the original literal contained the
  # invalid escape "\G", which is now written explicitly as "\\G".
  # The with-statement guarantees the handle is closed even if unpickling fails.
  with open("D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\x_train_540_final.pkl", "rb") as file_540:
      data = pickle.load(file_540)
  36. #print(data.isnull().any()) all false
  37.  
  38.  
  39. #print(data)
  40. #print(type(data))=<class 'pandas.core.frame.DataFrame'>
  41. #same as using pd.load
  42.  
  43. #prepare data: (Emaily)prepare_data*: load in data with selected features, forward filling the missing value, and prepare the train, valid, test series for model usage
  44. #feature selection(why: can check it later)
  45. #pick_index=['glucose level','fingerstick','basis_gsr','basis_skintem','acceleration','basal','bolus','meal']
  46.  
  # Prepare for class balancing: attach a class label to every row.
  # pd.cut with its default right-closed bins maps "glucose level" as
  #   (0, 70] -> -1,  (70, 130] -> 0,  (130, 10000] -> 1
  # and leaves NaN for readings outside (0, 10000].
  data["label"] = pd.cut(
      data["glucose level"],
      bins=[0, 70, 130, 10000],
      labels=[-1, 0, 1],
  )
  50. #print(data)
  51.  
  52.  
  53. #do balance by the label
  54. #from imblearn.over_sampling import BorderlineSMOT
  55. #Slicing: separate the features X from the target y
  56. #X,y=data.["time" "fingerstick"  "basis_gsr"  "basis_skintem"  "acceleration"  "bolus"  "basal"  "meal"  "basis_sleep"]
  57. #ros=BorderlineSMOTE(sampling_strategy={-1:3000,0:6000,1:})
  58. #At first I hoped to use the package directly for resampling.
  59. #However, I realized that the data is a time series, which makes the order of the rows important as well.
  60. #Therefore, I first rearrange the data so that it no longer depends on the time-series order.
  61. #Another way to deal with the imbalanced data would be to modify the loss function.
  62.  
  # Rearrange the series into fixed-size windows: row i of data_new holds the
  # features and labels of input rows i, i+1, ..., i+6 side by side.
  # Each lagged frame is the previous one with its first row removed, its columns
  # renamed with a "_<lag>" suffix, and its index reset so that the frames line
  # up positionally when concatenated.
  data_without = data.drop(columns=["time", "glucose level"])

  # Column names that remain after dropping "time" and "glucose level"; the
  # lagged copies get these names with "_1" ... "_6" appended.
  _WINDOW_BASE_NAMES = (
      "fingerstick", "basis_gsr", "basis_skintem", "acceleration",
      "bolus", "basal", "meal", "basis_sleep", "label",
  )

  data_0 = data_without.reset_index(drop=True)

  _window_frames = [data_0]
  _previous_frame = data_without
  for _lag in range(1, 7):
      # Dropping the first index label shifts the frame up by one more row.
      _shifted_frame = _previous_frame.drop(_previous_frame.index[0])
      _shifted_frame.columns = [_name + "_" + str(_lag) for _name in _WINDOW_BASE_NAMES]
      # reset_index returns a new frame whose index starts again at 0.
      _shifted_frame = _shifted_frame.reset_index(drop=True)
      _window_frames.append(_shifted_frame)
      _previous_frame = _shifted_frame

  # Keep the individual frames available under their original names.
  data_0, data_1, data_2, data_3, data_4, data_5, data_6 = _window_frames

  # The shorter lagged frames leave NaN in the trailing rows; drop every row
  # that contains a NaN.
  data_new = pd.concat(_window_frames, axis=1).dropna()
  99. #print(data_6)
  100. #print(data_new)
  101. #print(data["label"])
  102.  
  103.  
  104.  
  105.  
  # Build the prediction target and attach it to the windowed features.
  # data_6 row i carries the label of input row i + 6, so skipping its first 7
  # entries makes label_new[j] the label 7 rows after the end of window j.
  # NOTE(review): presumably this 7-row offset is the intended prediction
  # horizon -- confirm.
  from pandas.core.frame import DataFrame

  new_label = data_6["label_6"].tolist()
  label_new = new_label[7:]

  # Turn the label list into a one-column frame; concat aligns it with data_new
  # by index, and rows without a partner on either side become NaN and are dropped.
  n = {"label_new": label_new}
  data_label = DataFrame(n)
  data_complete = pd.concat([data_new, data_label], axis=1).dropna()
  120.  
  121.  
  122.  
  # Balance the classes with SMOTEENN (over-sampling followed by cleaning).
  # Every column except the target "label_new" is used as a feature.
  x_columns = [col for col in data_complete.columns if col != "label_new"]
  X = data_complete[x_columns]
  y = data_complete["label_new"]
  smote_enn = SMOTEENN(random_state=0)

  # Workaround kept from the original: the author reports that SMOTEENN produces
  # NaN unless the features are first round-tripped through a CSV file.
  filename = "D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\avoid_NaN.csv"
  X.to_csv(filename)
  # The file's first column is the saved row index. Reading it back as the index
  # (index_col=0) keeps it out of the feature matrix; previously it came back as
  # a feature column "Unnamed: 0", took part in the resampling, and was only
  # dropped afterwards.
  X_read = pd.read_csv(filename, index_col=0)
  # Use a fresh 0..n-1 index, as a plain read_csv would have produced.
  X_df = X_read.reset_index(drop=True)

  X_resampled, y_resampled = smote_enn.fit_resample(X_df, y)
  # Class counts the author recorded with the earlier version of this step:
  # print(sorted(Counter(y_resampled).items()))  # [(-1.0, 5659), (0.0, 3117), (1.0, 3380)]

  # To check whether NaN values reappeared, sum the null count of every column:
  # nan_list = X_resampled.isnull().sum().tolist()
  # print(nan_list)
  # print(sum(nan_list))
  149. #print(type(X_resampled)) #dataframe
  150.  
  151.  
  152.  
  # Normalisation: min-max scale every feature column to the range [0, 1].
  def _min_max_scale(column):
      """Return *column* rescaled as (x - min) / (max - min).

      A constant column has max == min, which would give 0/0 = NaN for every
      row; such a column is mapped to all zeros instead.
      """
      low = column.min()
      span = column.max() - low
      if span == 0:
          # column - low is exactly zero here; cast so the result is float like
          # the scaled columns.
          return (column - low).astype(float)
      return (column - low) / span


  X_scaled = X_resampled.apply(_min_max_scale)
  # The target (y_resampled) holds class ids and is deliberately not scaled.
  156.  
  157. #Try 2
  158. #X_scale=preprocessing.scale(X_resampled)
  159. #print(X_scale)
  160. #X_scaled=pd.DataFrame(X_scale,columns=x_columns)
  161.  
  162.  
  163. #Try3
  164. #colu = [x for x in X_resampled.columns if x not in ["label_new","label","label_1","label_2","label_3","label_4","labe_5","label_6"]]
  165. #print(colu)
  166. #max_min_scaler=lambda x: (x-np.min(x))/(np.max(x)-np.min(x))
  167. #for i in colu:
  168.  #   X_resampled[i] = X_resampled[[i]].apply(max_min_scaler)
  169. #print(X_resampled.mean())
  170.  
  # Join the scaled features with the resampled target column-wise and write the
  # finished training table to disk without the row index.
  data_f = pd.concat([X_scaled, y_resampled], axis=1)

  # print(data_f)
  data_f.to_csv(
      "D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\data.complete.csv",
      index=False,
  )
  178. #PermissionError: [Errno 13] Permission denied: 'D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\data.complete.csv'
  179.  
  180.  
  181. #When building the random forest (RF), this error appeared: Input contains NaN, infinity or a value too large for dtype('float32')
  182. #nan_list = data_f.isnull().sum().tolist()#sum the null count of every column
  183. #print(nan_list)
  184. #print(sum(nan_list))
  185.  
  186.  
  187.    
  188.  
  189.  
  190.  
  191.  
RAW Paste Data