Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
""" Author: Johnny (Shaun) Lowis, for Bodeker Scientific.
Uses a multivariate linear regression model to error-correct ODIN and ES642 data,
with the Woolston TEOM as the training set.
"""
- import netCDF4 as nc
- import numpy as np
- import pandas as pd
- from users.sl_scripts.MAPM.TEOM.plot_hourly_means import plot_data
- from users.sl_scripts.Regression.multivariate_regress_sample import regress
- from users.sl_scripts.Regression.multivariate_regress_sample import error
- from sklearn import preprocessing
class LeroyGression:
    """Regress ES642 variables against time-matched TEOM PM2.5 readings.

    Constructing an instance runs the whole pipeline: the training files are
    read and aligned on their common timestamps, predictors and target are
    standardised, ``regress`` is called, and the results are optionally
    plotted and/or passed to ``error``.

    Args:
        X_train_fp: Path of the netCDF file holding the training predictors.
            It must contain a ``time`` variable with a ``units`` attribute.
        y_train_fp: Path of the netCDF file holding the training target.
            It must contain ``time`` and ``pm2.5`` variables.
        X_test_fp: Path of the netCDF file holding the test predictors.
        X_train_variables: Variable specs handed to ``process_vars`` for the
            training file.
        X_test_variables: Variable specs handed to ``process_vars`` for the
            test file.
        regressor: Initial value of ``self.regressor``; replaced by the object
            returned from ``regress`` during construction.
        y_scaler: Initial value of ``self.y_scaler``; replaced by a fitted
            ``StandardScaler`` during construction.
        plot: When truthy, ``plot_data`` is called with the results.
        error_out: When truthy, ``error`` is called with the results.
    """

    def __init__(self, X_train_fp, y_train_fp, X_test_fp, X_train_variables, X_test_variables,
                 regressor=0, y_scaler=0, plot=False, error_out=False):
        self.X_train_fp = X_train_fp
        self.y_train_fp = y_train_fp
        self.X_test_fp = X_test_fp
        self.X_train_variables = X_train_variables
        self.X_test_variables = X_test_variables
        self.plot = plot
        self.error_out = error_out
        self.regressor = regressor
        self.y_scaler = y_scaler
        # _scale_data() loads the training data itself through _process_data(),
        # so calling _process_data() here as well would only read every file a
        # second time and throw the result away.
        self._scale_data()

    def _process_data(self):
        """Load the training files and keep only timestamps present in both.

        Returns:
            A 3-tuple ``(times, predictors_df, target_df)``: the matched TEOM
            timestamps, a DataFrame with one column per training variable,
            and a single-column DataFrame of the matched TEOM ``pm2.5`` values.
        """
        # Slicing with [:] copies the values out of the file, so each dataset
        # can be closed as soon as its variables have been read.
        with nc.Dataset(self.y_train_fp) as data_teom:
            time_var = data_teom.variables["time"]
            teom_time = nc.num2date(time_var[:], time_var.units)
            teom_pm = data_teom.variables['pm2.5'][:]

        with nc.Dataset(self.X_train_fp) as data_642:
            time_var = data_642.variables["time"]
            es642_time = nc.num2date(time_var[:], time_var.units)
            es642_vars = process_vars(self.X_train_variables, data_642)

        # return_indices=True also yields, for each common timestamp, its
        # position in the TEOM and ES642 time axes respectively.
        _, teom_idxs, es642_idxs = np.intersect1d(teom_time, es642_time, return_indices=True)

        matched_vars = [var[es642_idxs] for var in es642_vars]
        # One row per variable before the transpose, one column per variable after.
        predictors_df = pd.DataFrame(matched_vars).transpose()
        target_df = pd.DataFrame(teom_pm[teom_idxs])
        return teom_time[teom_idxs], predictors_df, target_df

    def _scale_data(self):
        """Standardise the data, run ``regress`` and optionally plot/report.

        Sets ``self.y_scaler`` to the scaler fitted on the training target and
        ``self.regressor`` to the object returned by the last ``regress`` call.
        """
        train_time, predictors_df, target_df = self._process_data()

        x_scaler = preprocessing.StandardScaler()
        X_train = x_scaler.fit_transform(predictors_df)

        self.y_scaler = preprocessing.StandardScaler()
        y_train = self.y_scaler.fit_transform(target_df)

        _, self.regressor = regress(X_train, y_train)

        with nc.Dataset(self.X_test_fp) as data_X_test:
            X_test_vars = pd.DataFrame(process_vars(self.X_test_variables, data_X_test)).transpose()
        # The test predictors are scaled with the statistics of the training set.
        X_test = x_scaler.transform(X_test_vars)

        # NOTE(review): this calls regress() again with the *test* predictors and
        # the *training* target, replacing the regressor obtained above. It looks
        # as if the intent was to predict with the already-fitted regressor, but
        # regress() is defined elsewhere, so the call is kept as it was - confirm.
        y_pred, self.regressor = regress(X_test, y_train)

        if self.plot:
            # NOTE(review): y_pred is plotted against the training timestamps even
            # though it comes from the test file - confirm the two time axes match.
            plot_data(train_time, y_train, train_time, y_pred)
        if self.error_out:
            error(y_train, y_pred)
def _parse_exponent(spec, exponent_text):
    """Turn the text following ``**`` in a variable spec into a number."""
    exponent_text = exponent_text.strip()
    if not exponent_text:
        # A bare "name **" keeps its historical meaning of squaring.
        return 2
    try:
        # Prefer an int so integer-typed data keeps its dtype when raised.
        return int(exponent_text)
    except ValueError:
        pass
    try:
        return float(exponent_text)
    except ValueError as err:
        raise ValueError(f"invalid exponent in variable spec {spec!r}") from err


def process_vars(vars_in, data):
    """Read the requested variables from a dataset, applying optional powers.

    Each entry of ``vars_in`` is either a plain variable name, looked up as
    ``data.variables[name][:]``, or a spec of the form ``"name ** p"``, in
    which case the variable ``name`` is read and raised to the power ``p``.
    Previously the text after ``**`` was ignored and the variable was always
    squared; ``"name ** 2"`` and a bare ``"name **"`` still square.

    Args:
        vars_in: Iterable of variable names / power specs.
        data: Object exposing a ``variables`` mapping whose values support
            ``[:]`` slicing (for example an open netCDF dataset).

    Returns:
        A list with one entry per element of ``vars_in``, in the same order.

    Raises:
        ValueError: If the text after ``**`` is not empty and not a number.
        KeyError: If a variable name is missing from ``data.variables``.
    """
    outlist = []
    for key in vars_in:
        name, marker, exponent_text = key.partition("**")
        if not marker:
            # Plain names are looked up exactly as given.
            outlist.append(data.variables[key][:])
            continue
        exponent = _parse_exponent(key, exponent_text)
        outlist.append(data.variables[name.strip()][:] ** exponent)
    return outlist
def main():
    """Run the regression on the Colocation_1 files and print the results.

    The averaged ES642 file supplies the training predictors, the Woolston
    TEOM file supplies the training target, and the raw ES642 file supplies
    the test predictors. The resulting y-scaler and regressor are printed.
    """
    teom_path = (
        r"/mnt/storage/Scratch/Shaun/working_dir/MAPM/TEOM/Colocation_1/Raw/NetCDF"
        r"/TEOM_Woolston_Colocation_1_raw.nc"
    )
    es642_averaged_path = (
        r"/mnt/temp/Projects/MAPM/Data_Permanent/MAPM_campaign/ES642/Colocation_1/Averaged/NetCDF/ES"
        r"-642_DM2_Christchurch2019_Colocation_1_averaged.nc"
    )
    es642_raw_path = (
        r"/mnt/storage/Scratch/Shaun/working_dir/MAPM/ES642/Colocation_1/Raw/NetCDF/ES"
        r"-642_DM2_Christchurch2019_Colocation_1_raw.nc"
    )

    # The same variable specs are used for the training and the test file.
    variable_specs = ['pm2.5', 'air_pressure', 'air_temperature', 'relative_humidity', 'pm2.5 ** 2']

    model = LeroyGression(
        es642_averaged_path,
        teom_path,
        es642_raw_path,
        variable_specs,
        variable_specs,
        plot=True,
        error_out=True,
    )
    print(model.y_scaler, model.regressor)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement