Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## Random Forest Regression
## A. PREPARING THE DATA
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Importing the dataset and defining the dependent and independent variables
dataset = pd.read_excel('ET data.clean.xlsx')
print(dataset.isnull().sum())  # shows missing values for each column
print(dataset.dtypes)          # shows the data type for each column
dataset.info()

X = dataset.iloc[:, 7:].values  # all predictors (numerical and categorical), skipping the first 7 columns
# Build the numerical-only predictor matrix: keep columns whose dtype is not "object"
Xnumvar = list(dataset.dtypes[dataset.dtypes != "object"].index)
# NOTE(review): .head(24914) caps the rows at a hard-coded count — presumably the
# full dataset length; confirm, or drop the .head() call to always use all rows.
Xnum = dataset[Xnumvar].head(24914)  # numerical Xs only - no categorical
Xnum = Xnum.iloc[:, 7:].values       # excludes the targets from the matrix (first 7 columns, not 6)

# Select exactly ONE dependent variable y: uncomment the line you want and
# comment out the currently active one.
#y = dataset.iloc[:, :5].values  # the first 5 columns as targets (slice :5 is 5 columns, not 6)
#y = dataset.pop("RE IC per Capita (W/person)")
y = dataset.pop("RE in TFEC (%)")
#y = dataset.pop("Access to electricity (%)")
#y = dataset.pop("Energy use per capita (kgoe)")
#y = dataset.pop("GHG per capita (tCO2)")
#y = dataset.pop("Energy Intensity (MJ/$2011 PPP GDP)")
#y = dataset.pop("CO2 intensity (kg per kg of oil equivalent energy use)")

# Replacing NaN with 0s (not ideal: imputation or dropping would preserve
# the distinction between "missing" and "zero")
Xnum = np.nan_to_num(Xnum)  # replaces all the NaN values in Xnum with 0
y = np.nan_to_num(y)        # replaces all the NaN values in y with 0 (comment fixed: this is y, not X)
## B. SPLITTING THE DATA FOR TRAINING A MACHINE LEARNING MODEL
# Splitting the dataset into the Training set and Test set (70/30 split,
# fixed random_state for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xnum, y, test_size=0.3, random_state=0)

## C. RANDOM FOREST REGRESSION
# Fitting Random Forest Regression to the training data.
# oob_score=True additionally computes an out-of-bag R^2 estimate.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
regressor.fit(X_train, y_train)

# Testing the random forest regression's accuracy.
# Bug fix: the original discarded the return value of .score(), so in a
# plain script nothing was shown — print the test-set R^2 explicitly.
print("Test-set R^2:", regressor.score(X_test, y_test))
print("Out-of-bag R^2:", regressor.oob_score_)

# Viewing the importance of the Xs (one value per feature column, sums to 1)
print(regressor.feature_importances_)
# Tuning the hyperparameters of the random forest with an exhaustive grid search.
# Bug fixes versus the original:
#   - `time` and `GridSearchCV` were never imported
#   - 'n_estimators': [15:500] was a SyntaxError (slice syntax inside a list
#     literal) -> replaced by an explicit list of candidate values
#   - 'gini'/'entropy' are CLASSIFICATION criteria and are invalid for
#     RandomForestRegressor -> use the regression criteria instead
#   - 'auto' for max_features is removed for regressors in modern sklearn
#   - undefined names fit_rf / training_set / class_set -> the estimator and
#     training data defined above (regressor, X_train, y_train)
import time
from sklearn.model_selection import GridSearchCV

start = time.time()
param_dist = {
    'n_estimators': list(range(15, 500, 50)),
    'max_depth': [2, 3, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['squared_error', 'absolute_error'],
}
cv_rf = GridSearchCV(regressor, cv=10,
                     param_grid=param_dist,
                     n_jobs=3)
cv_rf.fit(X_train, y_train)
print('Best Parameters using grid search: \n',
      cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))
# OTHER CODE THAT COULD BE USEFUL
# Showing pairwise correlations and per-column distributions for the entire
# dataset, saved to correlations.png.
sns.pairplot(dataset,
             diag_kws={'color': 'darkgray', 'edgecolor': 'white', 'lw': 0.25},
             plot_kws={'color': '#00688b', 'lw': 0, 'alpha': 0.5})
# Bug fix: the original wrote `plt.tight_layout` without parentheses, which
# only references the function and never calls it (a silent no-op).
plt.tight_layout()
plt.savefig('correlations.png')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement