-------------
1. DATE TRANSFORM
# parse the date strings into actual datetime objects
# set the date as the index; inplace=True applies the change to the DataFrame itself
# sort the index so the measurements are ordered by time
features_df['Datetime'] = pd.to_datetime(features_df['Datetime'])
features_df.set_index('Datetime', inplace=True)
features_df.sort_index(inplace=True)
features_df
-------------
2. MERGING DATASETS
# merge the two datasets into a new one on their indexes
df = pd.merge(left=features_df, right=target_df, left_index=True, right_index=True)
# merge the two datasets into a new one on a column from each
df = pd.merge(features_df, target_df, left_on='features_col', right_on='target_col')
-------------
3. INTERPOLATION
# linear interpolation draws a straight line between the nearest valid values on
# either side of a gap; for a single missing value that works out to the mean of
# its two neighbours. Effective for temperatures and similarly smooth series.
for feature in features:
    df[feature] = df[feature].interpolate(method='linear')
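# a quick toy check of that behaviour (values invented for illustration):
import pandas as pd
import numpy as np
s = pd.Series([20.0, np.nan, 22.0, np.nan, np.nan, 25.0])
print(s.interpolate(method='linear').tolist())
# [20.0, 21.0, 22.0, 23.0, 24.0, 25.0] -- a single gap gets the neighbour mean,
# longer gaps get evenly spaced values along the straight line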
--------------
4. GROUP BY 30 MIN
# the rows are 10 minutes apart; resample them into 30-minute buckets,
# averaging every three consecutive rows into one
df = df.groupby(pd.Grouper(freq="30min")).mean()
df
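# an equivalent, arguably more idiomatic spelling, assuming the datetime index from step 1:
df = df.resample("30min").mean()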
----------------
5. CORRELATION MATRIX
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True)
plt.show()
---------------
6. LINEPLOT
# when the index is a date, a line plot of a column shows how its values change over time
sns.lineplot(df['Temperature'])
--------------
7. LAG CREATION
# for each column and each lag in the given range, create a lagged copy:
# shift(lag) moves the column down by `lag` rows, so each row also carries the
# values from 1-5 steps earlier. The first rows have no history, so their lag
# columns are NaN and those rows are dropped.
for col in df.columns:
    for lag in range(1, 6):
        df[f'{col}_{lag}'] = df[col].shift(lag)
df.dropna(axis=0, inplace=True)
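# what shift() actually does, on a throwaway frame (values invented for illustration):
tiny = pd.DataFrame({'t': [10, 20, 30, 40]})
tiny['t_1'] = tiny['t'].shift(1)   # NaN, 10, 20, 30
tiny['t_2'] = tiny['t'].shift(2)   # NaN, NaN, 10, 20
print(tiny)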
------------------
8. TRAIN TEST SPLIT
# X gets everything except the target column, y gets the target
# when only the lag columns are kept in X, it is usually to avoid leaking
# values from the current timestep into the features
# shuffle=False keeps the rows in time order, which matters for time series
from sklearn.model_selection import train_test_split
X = df.drop('PowerConsumption', axis=1)
y = df['PowerConsumption']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
---------------------
9. SCALING
# always fit_transform on the train data only, and just transform the test data
# (fitting on the test data would leak information into the model)
# if the target is categorical it is already label encoded, so it needs no scaling
# y_train is 1-D, so it has to be reshaped to a column before MinMaxScaler accepts it
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler_y.transform(y_test.values.reshape(-1, 1))
----------------------
10. RESHAPE BEFORE LSTM (samples, lags, features)
# reshape to: number of rows, number of lags per feature, number of lagged features
# (5 is the number of lags created in step 7)
n_lags = 5
X_train = X_train.reshape((X_train.shape[0], n_lags, X_train.shape[1] // n_lags))
X_test = X_test.reshape((X_test.shape[0], n_lags, X_test.shape[1] // n_lags))
# alternative, when every existing column is treated as one timestep of a single feature:
import numpy as np
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
-------------------------
11. LSTM MODEL DEFINITION
# the Input layer only needs (timesteps, features), i.e. (lags, columns)
# LSTM layers follow with their neuron count and activation; every LSTM layer
# except the last needs return_sequences=True so the next LSTM receives a sequence
# a Dense layer finishes it: 1 unit for regression, and activation='linear'
# outputs the raw value (classification differs, see the sketch after step 12)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense
model = Sequential([
    Input((X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(1, activation="linear")
])
------------------
12. MODEL COMPILE
# mandatory after the model definition; adam is a safe default optimizer,
# for classification swap the loss function and metrics (see below)
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_squared_error"],
)
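# a sketch of the classification variant mentioned above (assumes the imports
# from step 11; n_classes is a placeholder for the number of label-encoded classes):
model = Sequential([
    Input((X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(n_classes, activation="softmax")   # class probabilities instead of one raw value
])
model.compile(
    loss="sparse_categorical_crossentropy",  # matches integer labels from LabelEncoder
    optimizer="adam",
    metrics=["accuracy"],
)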
---------------
13. MODEL TRAIN
# the result is usually named history so the loss curves can be plotted later
history = model.fit(X_train, y_train, validation_split=0.20, epochs=16, batch_size=64, shuffle=False)
------------------------
14. PLOT LOSS FUNCTION
sns.lineplot(history.history["loss"], label="loss")
sns.lineplot(history.history["val_loss"], label="val_loss")
------------------------
15. MODEL PREDICTION
y_pred = model.predict(X_test)
-------------------------
16. INVERSE SCALE TRANSFORM
# the model was trained on scaled data, so y_pred comes out scaled and we want
# to reverse that with the *target* scaler; undo y_test as well so the metrics
# in step 17 compare values on the original scale
y_pred = scaler_y.inverse_transform(y_pred)
y_test = scaler_y.inverse_transform(y_test)
---------------------------------
17. EVALUATION METRIC
# for regression use MAE, MSE, R2; for classification use accuracy, recall, F1 etc.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")
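# the classification side of that comment, as a minimal sketch (assumes integer
# class labels in y_test / y_pred):
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))  # precision, recall, F1 per class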
--------------------------
18. XGB MODEL
# with XGBoost, don't scale y and don't reshape: it works on 2-D tabular input
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators=30).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
---------------
19. GRID SEARCH
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid={
        "n_estimators": [15, 20, 25, 30, 35, 40],
        "max_depth": [2, 3, 4, 5, 6, 7]
    },
    cv=TimeSeriesSplit(n_splits=5)
)
grid_search.fit(X_train, y_train)
# which of the given params scored best
grid_search.best_params_
# build a new model with the best parameters, then evaluate and plot it
best_model = XGBRegressor(**grid_search.best_params_).fit(X_train, y_train)
y_pred = best_model.predict(X_test)
sns.lineplot(x=y_test.index, y=y_test.values, color='red')
sns.lineplot(x=y_test.index, y=y_pred, color='green')
------------------------
20. ? IN DATASET
# replace '?' with NaN so it can be handled like any other missing value
import numpy as np
df.replace('?', np.nan, inplace=True)
------------------------
21. CHECK COLUMN TYPES
df.dtypes
--------------------
22. SELECT ONLY CATEGORICAL COLUMNS
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
---------------------
23. CONVERT COLUMN VALUES FROM STRING TO NUMERIC
# errors='coerce' means strings that cannot be converted are replaced with NaN
df['FastCharge_KmH'] = pd.to_numeric(df['FastCharge_KmH'], errors='coerce')
-----------------------
24. HISTPLOT TO SEE VALUE DISTRIBUTION FOR ONE COLUMN
plt.figure(figsize=(12, 6))
sns.histplot(df['FastCharge_KmH'])
plt.show()
-----------------------
25. LABEL ENCODER
from sklearn.preprocessing import LabelEncoder
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
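# if the encoding ever needs to be reversed, one option (not required by the
# notes above) is to keep each fitted encoder:
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# later: encoders['workclass'].inverse_transform(df['workclass'])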
-----------------------
26. FILLNA WITH MODE, MEDIAN, MEAN...
# assignment instead of inplace=True avoids pandas' chained-assignment warnings
df['FastCharge_KmH'] = df['FastCharge_KmH'].fillna(df['FastCharge_KmH'].mode()[0])
-----------------------
27. PREDICT ONLY ONE ROW
# presumably select this single row instead of X_test?
df[df[kolonaID] == 23]
# or show how many '?' values each column has
for col in df.columns:
    print(f"{col} : {df[df[col] == '?'].shape[0]}")
------------------------
28. PLOT Y_PRED VS Y_TEST REGRESSION
plt.plot(y_test.values, label='Actual', color='red')
plt.plot(y_pred, label='XGB Prediction', color='green')
plt.legend()
plt.show()
-------------------------
29. RELABEL DATA
def relabelData(annotation):
    if annotation == 'N':
        return 'Normal Beat'
    elif annotation == 'V':
        return 'Ventricular Beat'
    else:
        return 'Other Beats'
df['Relabeled Beat'] = df['Beat Annotation'].apply(relabelData)
df['Relabeled Beat'].value_counts()
--------------------------
30. ONE HOT ENCODING
df = pd.get_dummies(df, columns=['Relabeled Beat', 'Relabeled Episode'], drop_first=True)
# get_dummies produces True/False columns; casting to float32 turns them into
# 0/1 (and float32 is the usual dtype for Keras inputs)
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
----------------------
31. SIMPLE IMPUTER
# for categorical data, when KNNImputer is not an option, impute the most frequent value:
from sklearn.impute import SimpleImputer
cat_imputer = SimpleImputer(strategy='most_frequent')
df['workclass'] = cat_imputer.fit_transform(df[['workclass']]).ravel()
df['native.country'] = cat_imputer.fit_transform(df[['native.country']]).ravel()
df['occupation'] = cat_imputer.fit_transform(df[['occupation']]).ravel()
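# the three repeated lines could also be a single loop (same column names assumed):
for col in ['workclass', 'native.country', 'occupation']:
    df[col] = cat_imputer.fit_transform(df[[col]]).ravel()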
---------------------
32. FILTER THROUGH COLUMN
df_filtered = df[(df['age'] > 40) & (df['age'] <= 50)]
--------------------
33. BOXPLOT COMPARING COLUMN VALUES
sns.boxplot(x='income', y='age', data=df_filtered)
plt.title('Age distribution across Income groups')
plt.show()
----------------------
34.