Fenny_Theo

<Type1>540_test Data Preprocessing

Jul 22nd, 2021
855
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2. import numpy as np
  3. import h5py
  4. import matplotlib
  5. import matplotlib.pyplot as plt
  6. from keras.models import Model, Sequential
  7. from keras.layers import Dense, Input, concatenate,Conv1D, Conv2D, MaxPooling2D, Conv2DTranspose,MaxPooling1D, Cropping2D, Multiply, subtract, Flatten, Reshape, Permute, LSTM, TimeDistributed,Dropout,BatchNormalization,UpSampling1D
  8. from keras.optimizers import SGD
  9. from keras.callbacks import ModelCheckpoint,EarlyStopping
  10. import tensorflow as tf
  11. from keras.models import load_model
  12. from keras import optimizers
  13. from keras import regularizers
  14. from math import sqrt
  15. from sklearn.metrics import mean_squared_error
  16. from sklearn.metrics import mean_absolute_error
  17. import pickle
  18. from sklearn.preprocessing import StandardScaler
  19. import time
  20. from sklearn import preprocessing
  21. from numpy import argmax
  22. from keras.utils import to_categorical
  23. from tabulate import tabulate
  24. from numpy import array
  25.  
  26.  
  27.  
  28. #implement the data
  29. file_540=open("D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\x_test_540.pkl","rb")
  30. data = pickle.load(file_540)
  31.  
## Check whether there are missing values
#print(data.isnull().sum())  -> yes; for several features more than half of the values are missing
  34.  
  35.  
  36.  
  37. ##统计缺失值情况
  38. def collect_nan_val(dataframe):
  39.     return dataframe.isna().sum() / dataframe.shape[0]*100
  40.  
  41. print(collect_nan_val(data))
  42.  
  43.  
  44. data=data.fillna(data.median(),inplace=False)
  45. #print(data_complete)
  46.  
  47. data["label"]=pd.cut(data["glucose level"], bins=[0,70,130,10000], labels=[-1,0,1])
  48.  
  49. data_without=data.drop(labels=["time","glucose level"],axis=1,inplace=False)
  50.  
  51. data_0=data_without
  52. data_0=data_0.reset_index(drop=True)
  53.  
  54.  
  55. data_1=data_without.drop(data.index[0])
  56. data_1.columns=["fingerstick_1", "basis_gsr_1", "basis_skintem_1", "acceleration_1", "bolus_1", "basal_1", "meal_1", "basis_sleep_1", "label_1"]
  57. data_1=data_1.reset_index(drop=True) #after reseting will give a new dataframe
  58. #print(data_1)
  59. data_2=data_1.drop(data_1.index[0])
  60. data_2.columns=["fingerstick_2", "basis_gsr_2", "basis_skintem_2", "acceleration_2", "bolus_2", "basal_2", "meal_2", "basis_sleep_2", "label_2"]
  61. data_2=data_2.reset_index(drop=True)
  62.  
  63. data_3=data_2.drop(data_2.index[0])
  64. data_3.columns=["fingerstick_3", "basis_gsr_3", "basis_skintem_3", "acceleration_3", "bolus_3", "basal_3", "meal_3", "basis_sleep_3", "label_3"]
  65. data_3=data_3.reset_index(drop=True)
  66.  
  67. data_4=data_3.drop(data_3.index[0])
  68. data_4.columns=["fingerstick_4", "basis_gsr_4", "basis_skintem_4", "acceleration_4", "bolus_4", "basal_4", "meal_4", "basis_sleep_4", "label_4"]
  69. data_4=data_4.reset_index(drop=True)
  70.  
  71. data_5=data_4.drop(data_4.index[0])
  72. data_5.columns=["fingerstick_5", "basis_gsr_5", "basis_skintem_5", "acceleration_5", "bolus_5", "basal_5", "meal_5", "basis_sleep_5", "label_5"]
  73. data_5=data_5.reset_index(drop=True)
  74.  
  75. data_6=data_5.drop(data_5.index[0])#though index is always 0, the output for data_6 is began at 6
  76. data_6.columns=["fingerstick_6", "basis_gsr_6", "basis_skintem_6", "acceleration_6", "bolus_6", "basal_6", "meal_6", "basis_sleep_6", "label_6"]
  77. data_6=data_6.reset_index(drop=True)
  78.  
  79.  
  80. data_new=pd.concat([data_0,data_1,data_2,data_3,data_4,data_5,data_6],axis=1)
  81. #print(data_new.dropna())
  82. data_new=data_new.dropna()#cut the line that include "NAN"
  83.  
  84.  
  85. #get the new label for the new data set
  86. i=0
  87. len_label=len(data_new)
  88. label_new=[]
  89. searching_list=data_6["label_6"]
  90. #print(type(searching_list))
  91. searching_list=searching_list.tolist()
  92. print(len(searching_list)) #type: list len:13103
  93. #for i in range(3):
  94. #    print(type(int((data.loc[str(i)]["label"]))))==int
  95. #print(len_label)=13103
  96.  
  97.  
  98.  
  99.  
  100. while i<=len(searching_list)-7:#len_label
  101.     j=0
  102.     label_list=[]
  103.     g=i
  104.     w=0
  105.     for j in range(7): #[0,6]
  106.         k=searching_list[g]
  107.         label_list.append(int(k))
  108.         g=g+1
  109.     #print(label_list)
  110.     for m in label_list:#未考虑-1,1在30分钟内同时出现的情况!!!
  111.         if m>0:
  112.             w=w+1
  113.         elif m==0:
  114.             w=w
  115.         else:
  116.             w=w-1
  117.     #print(w)
  118.     if w>0:
  119.         label_new.append(1)
  120.     elif w==0:
  121.         label_new.append(0)
  122.     else:
  123.         label_new.append(-1)
  124.     i=i+1
  125. #print(len(label_new))#=13097
  126.  
  127. #change the lbael list into dataframe and combine with the data_new
  128. from pandas.core.frame import DataFrame
  129. n={"label_new": label_new}
  130. data_label=DataFrame(n)
  131. data_complete=pd.concat([data_new,data_label],axis=1)
  132. data_complete=data_complete.dropna()#cut the line that include "NAN"
  133.  
  134. #count label
  135. #list_c=data_complete["label"].values.tolist()
  136. #print(dict(zip(*np.unique(list_c, return_counts=True)))) {-1: 142, 0: 1083, 1: 1835}
  137.  
  138. data_complete.to_csv("D:\\NCSU Research\\GEARS_Jul9\\GEARS\\540\\data_test.complete.csv")
  139.  
  140.  
  141.  
  142.    
  143.  
  144.  
  145.  
  146.  
  147.  
  148.  
RAW Paste Data