-------------
1. DATE TRANSFORM
# parse the date strings into actual datetime objects
# set the date as the index; inplace=True applies the change to the DataFrame itself
# sort the index so the measurements are ordered by time
features_df['Datetime'] = pd.to_datetime(features_df['Datetime'])
features_df.set_index('Datetime', inplace=True)
features_df.sort_index(inplace=True)
features_df
-------------
2. MERGING DATASETS
# merge the two datasets into a new one on their indexes
df = pd.merge(left=features_df, right=target_df, left_index=True, right_index=True)
# merge the two datasets into a new one on a column from each
df = pd.merge(features_df, target_df, left_on='features_col', right_on='target_col')
-------------
3. INTERPOLATION
# linear interpolation draws a straight line between the nearest valid values on
# either side of a gap; for a single missing value that works out to the mean of
# its two neighbours. Effective for temperatures and similarly smooth series.
for feature in features:
    df[feature] = df[feature].interpolate(method='linear')
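# a quick toy check of that behaviour (values invented for illustration):
import pandas as pd
import numpy as np
s = pd.Series([20.0, np.nan, 22.0, np.nan, np.nan, 25.0])
print(s.interpolate(method='linear').tolist())
# [20.0, 21.0, 22.0, 23.0, 24.0, 25.0] -- a single gap gets the neighbour mean,
# longer gaps get evenly spaced values along the straight line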
--------------
4. GROUP BY 30 MIN
# the rows are 10 minutes apart; resample them into 30-minute buckets,
# averaging every three consecutive rows into one
df = df.groupby(pd.Grouper(freq="30min")).mean()
df
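# an equivalent, arguably more idiomatic spelling, assuming the datetime index from step 1:
df = df.resample("30min").mean()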
----------------
5. CORRELATION MATRIX
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True)
plt.show()
---------------
6. LINEPLOT
# when the index is a date, a line plot of a column shows how its values change over time
sns.lineplot(df['Temperature'])
--------------
7. LAG CREATION
# for each column and each lag in the given range, create a lagged copy:
# shift(lag) moves the column down by `lag` rows, so each row also carries the
# values from 1-5 steps earlier. The first rows have no history, so their lag
# columns are NaN and those rows are dropped.
for col in df.columns:
    for lag in range(1, 6):
        df[f'{col}_{lag}'] = df[col].shift(lag)
df.dropna(axis=0, inplace=True)
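# what shift() actually does, on a throwaway frame (values invented for illustration):
tiny = pd.DataFrame({'t': [10, 20, 30, 40]})
tiny['t_1'] = tiny['t'].shift(1)   # NaN, 10, 20, 30
tiny['t_2'] = tiny['t'].shift(2)   # NaN, NaN, 10, 20
print(tiny)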
------------------
8. TRAIN TEST SPLIT
# X gets everything except the target column, y gets the target
# when only the lag columns are kept in X, it is usually to avoid leaking
# values from the current timestep into the features
# shuffle=False keeps the rows in time order, which matters for time series
from sklearn.model_selection import train_test_split
X = df.drop('PowerConsumption', axis=1)
y = df['PowerConsumption']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
---------------------
9. SCALING
# always fit_transform on the train data only, and just transform the test data
# (fitting on the test data would leak information into the model)
# if the target is categorical it is already label encoded, so it needs no scaling
# y_train is 1-D, so it has to be reshaped to a column before MinMaxScaler accepts it
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler_y.transform(y_test.values.reshape(-1, 1))
----------------------
10. RESHAPE BEFORE LSTM (samples, lags, features)
# reshape to: number of rows, number of lags per feature, number of lagged features
# (5 is the number of lags created in step 7)
n_lags = 5
X_train = X_train.reshape((X_train.shape[0], n_lags, X_train.shape[1] // n_lags))
X_test = X_test.reshape((X_test.shape[0], n_lags, X_test.shape[1] // n_lags))
# alternative, when every existing column is treated as one timestep of a single feature:
import numpy as np
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
-------------------------
11. LSTM MODEL DEFINITION
# the Input layer only needs (timesteps, features), i.e. (lags, columns)
# LSTM layers follow with their neuron count and activation; every LSTM layer
# except the last needs return_sequences=True so the next LSTM receives a sequence
# a Dense layer finishes it: 1 unit for regression, and activation='linear'
# outputs the raw value (classification differs, see the sketch after step 12)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense
model = Sequential([
    Input((X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(1, activation="linear")
])
------------------
12. MODEL COMPILE
# mandatory after the model definition; adam is a safe default optimizer,
# for classification swap the loss function and metrics (see below)
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_squared_error"],
)
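# a sketch of the classification variant mentioned above (assumes the imports
# from step 11; n_classes is a placeholder for the number of label-encoded classes):
model = Sequential([
    Input((X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(n_classes, activation="softmax")   # class probabilities instead of one raw value
])
model.compile(
    loss="sparse_categorical_crossentropy",  # matches integer labels from LabelEncoder
    optimizer="adam",
    metrics=["accuracy"],
)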
---------------
13. MODEL TRAIN
# the result is usually named history so the loss curves can be plotted later
history = model.fit(X_train, y_train, validation_split=0.20, epochs=16, batch_size=64, shuffle=False)
------------------------
14. PLOT LOSS FUNCTION
sns.lineplot(history.history["loss"], label="loss")
sns.lineplot(history.history["val_loss"], label="val_loss")
------------------------
15. MODEL PREDICTION
y_pred = model.predict(X_test)
-------------------------
16. INVERSE SCALE TRANSFORM
# the model was trained on scaled data, so y_pred comes out scaled and we want
# to reverse that with the *target* scaler; undo y_test as well so the metrics
# in step 17 compare values on the original scale
y_pred = scaler_y.inverse_transform(y_pred)
y_test = scaler_y.inverse_transform(y_test)
---------------------------------
17. EVALUATION METRIC
# for regression use MAE, MSE, R2; for classification use accuracy, recall, F1 etc.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")
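# the classification side of that comment, as a minimal sketch (assumes integer
# class labels in y_test / y_pred):
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))  # precision, recall, F1 per class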
--------------------------
18. XGB MODEL
# with XGBoost, don't scale y and don't reshape: it works on 2-D tabular input
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators=30).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
---------------
19. GRID SEARCH
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid={
        "n_estimators": [15, 20, 25, 30, 35, 40],
        "max_depth": [2, 3, 4, 5, 6, 7]
    },
    cv=TimeSeriesSplit(n_splits=5)
)
grid_search.fit(X_train, y_train)
# which of the given params scored best
grid_search.best_params_
# build a new model with the best parameters, then evaluate and plot it
best_model = XGBRegressor(**grid_search.best_params_).fit(X_train, y_train)
y_pred = best_model.predict(X_test)
sns.lineplot(x=y_test.index, y=y_test.values, color='red')
sns.lineplot(x=y_test.index, y=y_pred, color='green')
------------------------
20. ? IN DATASET
# replace '?' with NaN so it can be handled like any other missing value
import numpy as np
df.replace('?', np.nan, inplace=True)
------------------------
21. CHECK COLUMN TYPES
df.dtypes
--------------------
22. SELECT ONLY CATEGORICAL COLUMNS
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
---------------------
23. CONVERT COLUMN VALUES FROM STRING TO NUMERIC
# errors='coerce' means strings that cannot be converted are replaced with NaN
df['FastCharge_KmH'] = pd.to_numeric(df['FastCharge_KmH'], errors='coerce')
-----------------------
24. HISTPLOT TO SEE VALUE DISTRIBUTION FOR ONE COLUMN
plt.figure(figsize=(12, 6))
sns.histplot(df['FastCharge_KmH'])
plt.show()
-----------------------
25. LABEL ENCODER
from sklearn.preprocessing import LabelEncoder
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
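# if the encoding ever needs to be reversed, one option (not required by the
# notes above) is to keep each fitted encoder:
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# later: encoders['workclass'].inverse_transform(df['workclass'])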
-----------------------
26. FILLNA WITH MODE, MEDIAN, MEAN...
# assignment instead of inplace=True avoids pandas' chained-assignment warnings
df['FastCharge_KmH'] = df['FastCharge_KmH'].fillna(df['FastCharge_KmH'].mode()[0])
-----------------------
27. PREDICT ONLY ONE ROW
# presumably select this single row instead of X_test?
df[df[kolonaID] == 23]
# or show how many '?' values each column has
for col in df.columns:
    print(f"{col} : {df[df[col] == '?'].shape[0]}")
------------------------
28. PLOT Y_PRED VS Y_TEST REGRESSION
plt.plot(y_test.values, label='Actual', color='red')
plt.plot(y_pred, label='XGB Prediction', color='green')
plt.legend()
plt.show()
-------------------------
29. RELABEL DATA
def relabelData(annotation):
    if annotation == 'N':
        return 'Normal Beat'
    elif annotation == 'V':
        return 'Ventricular Beat'
    else:
        return 'Other Beats'
df['Relabeled Beat'] = df['Beat Annotation'].apply(relabelData)
df['Relabeled Beat'].value_counts()
--------------------------
30. ONE HOT ENCODING
df = pd.get_dummies(df, columns=['Relabeled Beat', 'Relabeled Episode'], drop_first=True)
# get_dummies produces True/False columns; casting to float32 turns them into
# 0/1 (and float32 is the usual dtype for Keras inputs)
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
----------------------
31. SIMPLE IMPUTER
# for categorical data, when KNNImputer is not an option, impute the most frequent value:
from sklearn.impute import SimpleImputer
cat_imputer = SimpleImputer(strategy='most_frequent')
df['workclass'] = cat_imputer.fit_transform(df[['workclass']]).ravel()
df['native.country'] = cat_imputer.fit_transform(df[['native.country']]).ravel()
df['occupation'] = cat_imputer.fit_transform(df[['occupation']]).ravel()
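# the three repeated lines could also be a single loop (same column names assumed):
for col in ['workclass', 'native.country', 'occupation']:
    df[col] = cat_imputer.fit_transform(df[[col]]).ravel()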
---------------------
32. FILTER THROUGH COLUMN
df_filtered = df[(df['age'] > 40) & (df['age'] <= 50)]
--------------------
33. BOXPLOT COMPARING COLUMN VALUES
sns.boxplot(x='income', y='age', data=df_filtered)
plt.title('Age distribution across Income groups')
plt.show()
----------------------
34.