Advertisement
dzocesrce

[VNP] A little bit of everything

May 23rd, 2025 (edited)
193
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.61 KB | None | 0 0
  1. -------------
  2. 1.DATE TRANSFORM
  3. #transform the date string to an actual date
  4. #set the date as an index, inplace=True means change is applied
  5. #sort index, so the measurements are sorted by time
  6. features_df['Datetime'] = pd.to_datetime(features_df['Datetime'])
  7. features_df.set_index('Datetime',inplace=True)
  8. features_df.sort_index(inplace=True)
  9. features_df
  10. -------------
  11. 2.MERGING DATASETS
  12. #merge the two datasets into a new one by their indexes
  13. df = pd.merge(left=features_df, right=target_df, right_index=True, left_index=True)
  14. #merge the two datasets into a new one by one of their columns
  15. df = pd.merge(features_df, target_df, left_on='features_col', right_on='target_col')
  16. -------------
  17. 3.INTERPOLATION
  18. #linear interpolation fills each missing value along a straight line between its nearest known neighbours (for a single gap, that is their mean).
  19. #effective when working with temperatures, or something similar in distribution
  20. for feature in features:
  21.   df[feature] = df[feature].interpolate(method='linear')
  22. --------------
  23. 4.GROUP BY 30 MIN
  24. #so we have rows separated by 10 minutes, and in the new data set we want to kind of group 3 as 1, with their average values.
  25. df = df.groupby(pd.Grouper(freq="30min")).mean()
  26. df
  27. ----------------
  28. 5.CORRELATION MATRIX
  29. corr_matrix = df.corr()
  30. plt.figure(figsize=(12,8))
  31. sns.heatmap(corr_matrix,annot=True)
  32. plt.show()
  33. ---------------
  34. 6.LINEPLOT
  35. #when our index is a date, we can line plot a column to see how its values change over time
  36. sns.lineplot(df['Temperature'])
  37. --------------
  38. 7.LAG CREATION
  39. #for each column and for the given range, create lags, and then drop the rows with missing values.
  40. #this time I included every column, but still not 100% sure how it works.
  41. for col in df.columns:
  42.   for lag in range (1,6):
  43.     df[f'{col}_{lag}'] = df[col].shift(lag)
  44. df.dropna(axis=0,inplace=True)
  45. ------------------
  46. 8.TRAIN TEST SPLIT
  47. #X gets everything except the target we want to predict, and y gets the target column
  48. #this time we only left the lag columns, not sure why and when does that occur.
  49. from sklearn.model_selection import train_test_split
  50. X = df.drop('PowerConsumption',axis=1)
  51. y = df['PowerConsumption']
  52. X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
  53. ---------------------
  54. 9.SCALING
  55. #Always fit_transform only the train data, and only transform the test data.
  56. #However, if it's a categorical output, it will already be Label Encoded, so it doesn't need any transform.
  57. #Because y_train is 1D, it needs to be reshaped so that MinMaxScaler can work.
  58. scaler = MinMaxScaler()
  59. X_train = scaler.fit_transform(X_train)
  60. X_test = scaler.transform(X_test)
  61. scaler_y = MinMaxScaler()
  62. y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
  63. y_test = scaler_y.transform(y_test.values.reshape(-1, 1))
  64. ----------------------
  65. 10. RESHAPE BEFORE LSTM (samples,lags,features)
  66. #reshape with: number of rows, number of lags per feature, number of features lagged
  67. X_train = X_train.reshape((X_train.shape[0], 5, (X_train.shape[1] // lag)))
  68. X_test = X_test.reshape((X_test.shape[0], 5, (X_test.shape[1] // lag)))
  69. #one more way I suppose it can be done.
  70. import numpy as np
  71. X_train = np.expand_dims(X_train.values, axis=-1)
  72. X_test = np.expand_dims(X_test.values, axis=-1)
  73. -------------------------
  74. 11. LSTM Model DEFINITION
  75. #so apparently, for the input we only specify the number of lags and the number of columns,
  76. #then the LSTM layers follow, each with a number of neurons and an activation function; usually the first one has return_sequences=True
  77. #it finishes with a Dense layer, which always has 1 unit for regression; activation='linear' means it outputs the actual value
  78. #this is different for Classification problems.
  79. model = Sequential([
  80.     Input((X_train.shape[1], X_train.shape[2],)),
  81.     LSTM(64, activation="relu", return_sequences=True),
  82.     LSTM(32, activation="relu"),
  83.     Dense(1, activation="linear")
  84. ])
  85. ------------------
  86. 12. MODEL COMPILE
  87. #mandatory after the model definition, adam is always there, but if there is classification, change the loss function and metrics.
  88. model.compile(
  89.     loss="mean_squared_error",
  90.     optimizer="adam",
  91.     metrics=["mean_squared_error"],
  92. )
  93. ---------------
  94. 13. MODEL TRAIN
  95. # yeah and the result is usually named history
  96. history = model.fit(train_X, train_y, validation_split=0.20, epochs=16, batch_size=64, shuffle=False)
  97. ------------------------
  98. 14. PLOT LOSS FUNCTION
  99. sns.lineplot(history.history["loss"], label="loss")
  100. sns.lineplot(history.history["val_loss"], label="val_loss")
  101. ------------------------
  102. 15. MODEL PREDICTION
  103. y_pred = model.predict(X_test)
  104. -------------------------
  105. 16. INVERSE SCALE TRANSFORM
  106. #Because the model is trained on scaled X_train and y_train data, y_pred is scaled and we want to reverse it.
  107. y_pred = scaler.inverse_transform(y_pred)
  108. ---------------------------------
  109. 17. EVALUATION METRIC
  110. #for regression use MAE,MSE,R2, whereas for classification use accuracy, recall, F1 etc.
  111. mae = mean_absolute_error(y_test, y_pred)
  112. mse = mean_squared_error(y_test, y_pred)
  113. r2 = r2_score(y_test, y_pred)
  114. print(f"MAE:{mae}")
  115. print(f"MSE:{mse}")
  116. print(f"R2:{r2}")
  117. --------------------------
  118. 18.XGB MODEL
  119. #when using an xgb_model, don't scale y (no fit_transform on y_train/y_test), and don't reshape X to 3D, as XGBoost expects 2D input.
  120. xgb_model = XGBRegressor(n_estimators=30).fit(X_train, y_train)
  121. y_pred = xgb_model.predict(X_test)
  122. mae = mean_absolute_error(y_test,y_pred)
  123. mse = mean_squared_error(y_test,y_pred)
  124. r2 = r2_score(y_test,y_pred)
  125. ---------------
  126. 19.GRID SEARCH
  127. grid_search = GridSearchCV(
  128.     estimator=XGBRegressor(),
  129.     param_grid={
  130.         "n_estimators": [15, 20, 25, 30, 35, 40],
  131.         "max_depth": [2, 3, 4, 5, 6, 7]
  132.     },
  133.     cv=TimeSeriesSplit(n_splits=5)
  134. )
  135. grid_search.fit(train_X, train_y)
  136. #calculates which are the best params of the ones given
  137. grid_search.best_params_
  138. #you create a new model with the best parameters and evaluate it
  139. sns.lineplot(x=test_y.index, y=test_y.values,color='red')
  140. sns.lineplot(x=test_y.index, y=pred_y,color='green')
  141. ------------------------
  142. 20. ? IN DATASET
  143. #replace ? with NaN so you can handle it after
  144. df.replace('?', np.nan, inplace=True)
  145. ------------------------
  146. 21. CHECK COLUMN TYPES
  147. df.dtypes
  148. --------------------
  149. 22. SELECT ONLY CATEGORICAL COLUMNS
  150. categorical_cols = [col for col in df.columns if df[col].dtype=="object"]
  151. --------------------
  152. 23. CONVERT COLUMN VALUES FROM STRING TO NUMERIC
  153. #errors='coerce' means strings that cannot be converted will be replaced with NaN
  154. df['FastCharge_KmH'] = pd.to_numeric(df['FastCharge_KmH'],errors='coerce')
  155. ---------------------
  156. 24. HISTPLOT TO SEE VALUE DISTRIBUTION FOR ONE COLUMN
  157. plt.figure(figsize=(12,6))
  158. sns.histplot(df['FastCharge_KmH'])
  159. plt.show()
  160. -----------------------
  161. 25. LABEL ENCODER
  162. categorical_cols = [col for col in df.columns if df[col].dtype=="object"]
  163. for col in categorical_cols:
  164.   le = LabelEncoder()
  165.   df[col] = le.fit_transform(df[col])
  166. -----------------------
  167. 26. FILLNA WITH MODE,MEDIAN,MEAN...
  168. df['FastCharge_KmH'].fillna(df['FastCharge_KmH'].mode()[0],inplace=True)
  169. -----------------------
  170. 27. PREDICT ONLY ONE ROW
  171. #presumably use this instead of X_test?
  172. df[df[kolonaID] == 23 ]
  173. #or show how many '?' each column has
  174. for col in df.columns:
  175.   print(f"{col} : {df[df[col] == '?'].shape[0]}")
  176. ------------------------
  177. 28. PLOT Y_PRED VS Y_TEST REGRESSION
  178. plt.plot(y_test.values, label='Actual', color='red')
  179. plt.plot(y_pred, label='XGB Prediction', color='green')
  180. plt.show()
  181. 29. RELABEL DATA
  182. def relabelData(annotation):  #map one raw beat annotation code to a readable label string
  183.     if annotation == 'N':
  184.         return 'Normal Beat'
  185.     elif annotation == 'V':
  186.         return 'Ventricular Beat'
  187.     else:  #every code other than 'N' and 'V' falls into one shared label
  188.         return 'Other Beats'
  189. df['Relabeled Beat'] = df['Beat Annotation'].apply(relabelData)
  190. df['Relabeled Beat'].value_counts()
  191. --------------------------
  192. 30. ONE HOT ENCODING
  193. df = pd.get_dummies(df, columns=['Relabeled Beat', 'Relabeled Episode'], drop_first=True)
  194. #so that they are not True/False, but rather 0/1?
  195. X_train = X_train.astype(np.float32)
  196. X_test = X_test.astype(np.float32)
  197. ----------------------
  198. 31. SIMPLE IMPUTER
  199. #when there is categorical data and KNN IMPUTER is not available:
  200. cat_imputer = SimpleImputer(strategy='most_frequent')
  201. df['workclass'] = cat_imputer.fit_transform(df[['workclass']]).ravel()
  202. df['native.country'] = cat_imputer.fit_transform(df[['native.country']]).ravel()
  203. df['occupation'] = cat_imputer.fit_transform(df[['occupation']]).ravel()
  204. ---------------------
  205. 32. FILTER THROUGH COLUMN
  206. df_filtered = df[(df['age'] > 40) & (df['age'] <= 50)]
  207. --------------------
  208. 33. BOXPLOT COMPARING COLUMNS VALUES
  209. --------------------------
  210. sns.boxplot(x='income', y='age', data=df_filtered)
  211. plt.title('Age distribution across Income groups')
  212. plt.show()
  213. ----------------------
  214. 34.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement