Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## Random Forest Regression
## A. PREPARING THE DATA
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Importing the dataset and defining the dependent and independent variables
dataset = pd.read_excel('ET data.clean.xlsx')
print(dataset.isnull().sum())  # shows missing values for each column
print(dataset.dtypes)          # shows the data type for each column
dataset.info()

X = dataset.iloc[:, 7:].values  # all predictors (numerical and categorical), skipping the first 7 columns
# Build the numerical-only predictor matrix: keep columns whose dtype is not "object"
Xnumvar = list(dataset.dtypes[dataset.dtypes != "object"].index)
# NOTE(review): .head(24914) caps the rows at a hard-coded count — presumably the
# full dataset length; confirm, or drop the .head() call to always use all rows.
Xnum = dataset[Xnumvar].head(24914)  # numerical Xs only - no categorical
Xnum = Xnum.iloc[:, 7:].values       # excludes the targets from the matrix (first 7 columns, not 6)

# Select exactly ONE dependent variable y: uncomment the line you want and
# comment out the currently active one.
#y = dataset.iloc[:, :5].values  # the first 5 columns as targets (slice :5 is 5 columns, not 6)
#y = dataset.pop("RE IC per Capita (W/person)")
y = dataset.pop("RE in TFEC (%)")
#y = dataset.pop("Access to electricity (%)")
#y = dataset.pop("Energy use per capita (kgoe)")
#y = dataset.pop("GHG per capita (tCO2)")
#y = dataset.pop("Energy Intensity (MJ/$2011 PPP GDP)")
#y = dataset.pop("CO2 intensity (kg per kg of oil equivalent energy use)")

# Replacing NaN with 0s (not ideal: imputation or dropping would preserve
# the distinction between "missing" and "zero")
Xnum = np.nan_to_num(Xnum)  # replaces all the NaN values in Xnum with 0
y = np.nan_to_num(y)        # replaces all the NaN values in y with 0 (comment fixed: this is y, not X)
## B. SPLITTING THE DATA FOR TRAINING A MACHINE LEARNING MODEL
# Splitting the dataset into the Training set and Test set (70/30 split,
# fixed random_state for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xnum, y, test_size=0.3, random_state=0)

## C. RANDOM FOREST REGRESSION
# Fitting Random Forest Regression to the training data.
# oob_score=True additionally computes an out-of-bag R^2 estimate.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
regressor.fit(X_train, y_train)

# Testing the random forest regression's accuracy.
# Bug fix: the original discarded the return value of .score(), so in a
# plain script nothing was shown — print the test-set R^2 explicitly.
print("Test-set R^2:", regressor.score(X_test, y_test))
print("Out-of-bag R^2:", regressor.oob_score_)

# Viewing the importance of the Xs (one value per feature column, sums to 1)
print(regressor.feature_importances_)
# Tuning the hyperparameters of the random forest with an exhaustive grid search.
# Bug fixes versus the original:
#   - `time` and `GridSearchCV` were never imported
#   - 'n_estimators': [15:500] was a SyntaxError (slice syntax inside a list
#     literal) -> replaced by an explicit list of candidate values
#   - 'gini'/'entropy' are CLASSIFICATION criteria and are invalid for
#     RandomForestRegressor -> use the regression criteria instead
#   - 'auto' for max_features is removed for regressors in modern sklearn
#   - undefined names fit_rf / training_set / class_set -> the estimator and
#     training data defined above (regressor, X_train, y_train)
import time
from sklearn.model_selection import GridSearchCV

start = time.time()
param_dist = {
    'n_estimators': list(range(15, 500, 50)),
    'max_depth': [2, 3, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['squared_error', 'absolute_error'],
}
cv_rf = GridSearchCV(regressor, cv=10,
                     param_grid=param_dist,
                     n_jobs=3)
cv_rf.fit(X_train, y_train)
print('Best Parameters using grid search: \n',
      cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))
# OTHER CODE THAT COULD BE USEFUL
# Showing pairwise correlations and per-column distributions for the entire
# dataset, saved to correlations.png.
sns.pairplot(dataset,
             diag_kws={'color': 'darkgray', 'edgecolor': 'white', 'lw': 0.25},
             plot_kws={'color': '#00688b', 'lw': 0, 'alpha': 0.5})
# Bug fix: the original wrote `plt.tight_layout` without parentheses, which
# only references the function and never calls it (a silent no-op).
plt.tight_layout()
plt.savefig('correlations.png')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement