Fenny_Theo

<Type1>540_train Data Preprocessing

Jul 22nd, 2021
958
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2. import numpy as np
  3. import h5py
  4. import matplotlib
  5. #matplotlib.use('agg')
  6. import matplotlib.pyplot as plt
  7. from keras.models import Model, Sequential
  8. from keras.layers import Dense, Input, concatenate,Conv1D, Conv2D, MaxPooling2D, Conv2DTranspose,MaxPooling1D, Cropping2D, Multiply, subtract, Flatten, Reshape, Permute, LSTM, TimeDistributed,Dropout,BatchNormalization,UpSampling1D
  9. from keras.optimizers import SGD
  10. from keras.callbacks import ModelCheckpoint,EarlyStopping
  11. import tensorflow as tf
  12. from keras.models import load_model
  13. from keras import optimizers
  14. from keras import regularizers
  15. from math import sqrt
  16. from sklearn.metrics import mean_squared_error
  17. from sklearn.metrics import mean_absolute_error
  18. import pickle
  19. from sklearn.preprocessing import StandardScaler
  20. import time
  21. from sklearn import preprocessing
  22. from numpy import argmax
  23. from keras.utils import to_categorical
  24. from tabulate import tabulate
  25. from numpy import array
  26.  
  27.  
  28. #import the data
  29. #path="D:\\NCSU Research\\GEARS_Jul9\GEARS\\540\\x_train_540_final.pkl"
  30. #data=np.load(path, allow_pickle=True) #1. in the path, change \ to \\; 2. add allow_pickle=True
  31.  
  # Load the pickled 540 training set (a pandas DataFrame according to the author's notes below).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the handle even if unpickling fails; every
# backslash in the path is now escaped (the original relied on "\G" being an
# unrecognised escape, which yields the same path but triggers a warning).
with open("D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\x_train_540_final.pkl", "rb") as file_540:
    data = pickle.load(file_540)
  34. #print(data.isnull().any()) all false
  35.  
  36.  
  37. #print(data)
  38. #print(type(data))=<class 'pandas.core.frame.DataFrame'>
  39. #same as using pd.load
  40.  
  41. #prepare data: (Emaily)prepare_data*: load in data with selected features, forward filling the missing value, and prepare the train, valid, test series for model usage
  42. #feature selection(why: can check it later)
  43. #pick_index=['glucose level','fingerstick','basis_gsr','basis_skintem','acceleration','basal','bolus','meal']
  44.  
  # Step toward balancing the data: add a class label derived from the glucose level.
# pandas.cut uses right-inclusive intervals by default, so the classes are
# (0, 70] -> -1, (70, 130] -> 0 and (130, 10000] -> 1.
glucose_bin_edges = [0, 70, 130, 10000]
glucose_bin_labels = [-1, 0, 1]
data["label"] = pd.cut(
    data["glucose level"],
    bins=glucose_bin_edges,
    labels=glucose_bin_labels,
)
  48. #print(data)
  49.  
  50.  
  51. #do balance by the label
  52. #from imblearn.over_sampling import BorderlineSMOT
  53. #slicing: separate the features X from the target y
  54. #X,y=data.["time" "fingerstick"  "basis_gsr"  "basis_skintem"  "acceleration"  "bolus"  "basal"  "meal"  "basis_sleep"]
  55. #ros=BorderlineSMOTE(sampling_strategy={-1:3000,0:6000,1:})
  56. #At first I hoped to use the package directly for resampling.
  57. #However, the data is a time series, so the order of the rows also matters.
  58. #Therefore I first rearrange the data so that each row no longer depends on its position in the series.
  59. #Another way to deal with the imbalanced data would be to modify the loss function.
  60.  
  # Rearrange the data: place 7 consecutive samples side by side in a single row.
# Frame k holds the rows from position k onward, renumbered from 0, with "_k"
# appended to every column name (frame 0 keeps the plain names). Row i of the
# combined table therefore contains samples i .. i+6 of the original series.
WINDOW_SIZE = 7

data_without = data.drop(labels=["time", "glucose level"], axis=1)

shifted_frames = []
for step in range(WINDOW_SIZE):
    # Positional slicing drops exactly the first `step` rows, whatever the
    # index labels are; resetting the index shifts the series up by `step`.
    frame = data_without.iloc[step:].reset_index(drop=True)
    if step > 0:
        # Derive the names from the actual columns instead of a hard-coded,
        # position-dependent list that must match their count and order.
        frame = frame.add_suffix("_{}".format(step))
    shifted_frames.append(frame)

# Keep the individual frame names; data_6["label_6"] is read later when the
# per-window labels are built.
data_0, data_1, data_2, data_3, data_4, data_5, data_6 = shifted_frames

# The shorter frames leave NaN in their trailing rows after the column-wise
# concat; dropna() removes those incomplete rows (and any other row that has
# a missing value).
data_new = pd.concat(shifted_frames, axis=1).dropna()
# Author's notes: data_6 is 13103 x 9 and data_new is 13103 x 63.
  97. #print(data_6) #13103*9
  98. #print(data_new)#13103*63
  99. #print(data["label"])
  100.  
  101.  
  102.  
  # Get the new label for the new data set: one aggregated label per 7-sample window.
# Each window covers LABEL_WINDOW consecutive entries of data_6["label_6"].
# Inside a window every positive label counts +1 and every negative label
# counts -1 (zeros are neutral); the window label is the sign of that balance.
# NOTE: windows that contain both -1 and 1 (the author's "within 30 minutes"
# case) are not treated specially -- the opposite labels simply cancel out.
LABEL_WINDOW = 7

len_label = len(data_new)  # author's note: 13103
searching_list = data_6["label_6"].tolist()
print(len(searching_list))  # author's note: list of length 13103

label_new = []
# One window per start position; an input shorter than LABEL_WINDOW yields none.
for start in range(len(searching_list) - LABEL_WINDOW + 1):
    window_labels = [int(value) for value in searching_list[start:start + LABEL_WINDOW]]
    positives = sum(1 for value in window_labels if value > 0)
    negatives = sum(1 for value in window_labels if value < 0)
    balance = positives - negatives
    if balance > 0:
        label_new.append(1)
    elif balance == 0:
        label_new.append(0)
    else:
        label_new.append(-1)
# Author's note: len(label_new) == 13097
  143. #print(len(label_new))#=13097
  144.  
  # Turn the label list into a DataFrame, attach it to data_new and export the result.
# pd.DataFrame is the public constructor; no import from pandas.core is needed.
data_label = pd.DataFrame({"label_new": label_new})

# The column-wise concat aligns rows on the index; rows that end up with a
# missing value on either side (e.g. no window label) are dropped.
data_complete = pd.concat([data_new, data_label], axis=1).dropna()

# print(data_complete)
data_complete.to_csv("D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\data.complete.csv")
# Author's note: this raised "PermissionError: [Errno 13] Permission denied"
# while the CSV was open in Excel; close the file there before re-running.
  156.  
  157.  
  158.  
  159.    
  160.  
  161.  
  162.  
  163.  
RAW Paste Data