## Random Forest Regression

## A. PREPARING THE DATA
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Importing the dataset and defining the dependent and independent variables
dataset = pd.read_excel('ET data.clean.xlsx')
print(dataset.isnull().sum()) #shows missing values for each column
print(dataset.dtypes) #shows the data type for each column
dataset.info()

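#(Optional sketch, not in the original paste: on a wide table the raw counts
# above are hard to scan; the share of missing values per column is a quicker
# check. Uses only pandas calls on the same dataset.)
print(dataset.isnull().mean().sort_values(ascending = False).round(3)) #fraction of NaNs per column, worst first
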
X = dataset.iloc[:, 7:].values #includes all Xs, numerical and categorical
Xnumvar = list(dataset.dtypes[dataset.dtypes != "object"].index) #intermediate variable makes a list of the headers of the non-object (numerical) data types
Xnum = dataset[Xnumvar].head(24914) #includes numerical Xs only - no categorical
Xnum = Xnum.iloc[:, 7:].values #excludes the ys from the dataset (the first 7 columns)

#Uncomment one of the following to select a specific y to analyse, and comment out the current y
#y = dataset.iloc[:, :7].values #all 7 dependent variables y
#y = dataset.pop("RE IC per Capita (W/person)")
y = dataset.pop("RE in TFEC (%)")
#y = dataset.pop("Access to electricity (%)")
#y = dataset.pop("Energy use per capita (kgoe)")
#y = dataset.pop("GHG per capita (tCO2)")
#y = dataset.pop("Energy Intensity (MJ/$2011 PPP GDP)")
#y = dataset.pop("CO2 intensity (kg per kg of oil equivalent energy use)")

#Replacing NaN with 0s (this is not ideal; I would like to keep the data gaps rather than 0s)
Xnum = np.nan_to_num(Xnum) #replaces all the NaN values in Xnum with 0
y = np.nan_to_num(y) #replaces all the NaN values in y with 0

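#(Optional sketch: one way around the 0-filling above is statistical imputation;
# this is an assumption about a possible alternative, not part of the original
# analysis. scikit-learn's SimpleImputer fills NaNs per column.)
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(strategy = 'mean') #or 'median'; replaces each NaN with the column statistic
#Xnum = imputer.fit_transform(Xnum)
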
## B. SPLITTING THE DATA FOR TRAINING A MACHINE LEARNING MODEL
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xnum, y, test_size = 0.3, random_state = 0)

## C. RANDOM FOREST REGRESSION
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0, oob_score = True)
regressor.fit(X_train, y_train)

#Testing the random forest regression's performance (for regressors, score returns R^2, not classification accuracy)
print(regressor.score(X_test, y_test))

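#(Optional check, not in the original paste: oob_score = True was set above,
# so the out-of-bag R^2 is available as a free validation estimate.)
print(regressor.oob_score_) #out-of-bag R^2
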
#Viewing the importance of the Xs
print(regressor.feature_importances_)

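#(Optional sketch, not in the original paste: the raw importance array is hard
# to read without names. This assumes the feature names are Xnumvar[7:],
# matching the iloc[:, 7:] slice used to build Xnum above.)
for name, imp in sorted(zip(Xnumvar[7:], regressor.feature_importances_),
                        key = lambda pair: pair[1], reverse = True):
    print('{}: {:.3f}'.format(name, imp))
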
#Tuning the Hyperparameters of the Random Forest
import time
from sklearn.model_selection import GridSearchCV

start = time.time()

param_dist = {'n_estimators': [15, 50, 100, 250, 500], #[15:500] is not valid Python; list values spanning that range instead
              'max_depth': [2, 3, 4],
              'bootstrap': [True, False],
              'max_features': ['sqrt', 'log2', None], #'auto' has been removed for regressors in recent scikit-learn
              'criterion': ['squared_error', 'absolute_error']} #'gini'/'entropy' are classifier criteria; regressors use these (named 'mse'/'mae' before scikit-learn 1.0)

cv_rf = GridSearchCV(regressor, cv = 10,
                     param_grid = param_dist,
                     n_jobs = 3)

cv_rf.fit(X_train, y_train) #fit on the training split defined in section B
print('Best Parameters using grid search: \n',
      cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))
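
#(Optional follow-up, not in the original paste: GridSearchCV refits the best
# model on the full training set by default, so it can be scored on the test set.)
print('Test-set R^2 of the best model:', cv_rf.best_estimator_.score(X_test, y_test))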


#OTHER CODE THAT COULD BE USEFUL
#Showing correlations and distributions in the entire dataset
sns.pairplot(dataset, diag_kws = {'color':'darkgray', 'edgecolor':'white', 'lw':0.25},
             plot_kws = {'color':'#00688b', 'lw':0, 'alpha':0.5})
plt.tight_layout() #tight_layout is a function and needs the parentheses to run
plt.savefig('correlations.png')
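
#(Optional sketch, not in the original paste: a pairplot over ~25k rows can be
# very slow; a correlation heatmap of the numerical columns is a lighter
# alternative for the same "show correlations" purpose.)
plt.figure(figsize = (12, 10))
sns.heatmap(dataset.select_dtypes(include = 'number').corr(), cmap = 'coolwarm', center = 0)
plt.tight_layout()
plt.savefig('correlation_heatmap.png')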