# import dependencies

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.rcdefaults()
import numpy as np
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.simplefilter('ignore')

# read in CSVs for all available years

raw_2010_economic_data = pd.read_csv('Machine_Learning_Dataset_2010.csv')
raw_2011_economic_data = pd.read_csv('Machine_Learning_Dataset_2011.csv')
raw_2012_economic_data = pd.read_csv('Machine_Learning_Dataset_2012.csv')
raw_2013_economic_data = pd.read_csv('Machine_Learning_Dataset_2013.csv')
raw_2014_economic_data = pd.read_csv('Machine_Learning_Dataset_2014.csv')
raw_2015_economic_data = pd.read_csv('Machine_Learning_Dataset_2015.csv')
raw_2016_economic_data = pd.read_csv('Machine_Learning_Dataset_2016.csv')
raw_2017_economic_data = pd.read_csv('Machine_Learning_Dataset_2017.csv')
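
# (aside) a more compact equivalent of the eight reads above and the df_list
# below, assuming the same 'Machine_Learning_Dataset_<year>.csv' naming
# pattern; a sketch, not part of the original flow:
# raw_frames = {year: pd.read_csv(f'Machine_Learning_Dataset_{year}.csv')
#               for year in range(2010, 2018)}
# df_list = list(raw_frames.values())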

# make a list of dataframes

df_list = [raw_2010_economic_data, raw_2011_economic_data,
           raw_2012_economic_data, raw_2013_economic_data,
           raw_2014_economic_data, raw_2015_economic_data,
           raw_2016_economic_data, raw_2017_economic_data]

# loop through dataframes and drop columns with null values

for df in df_list:
    df.dropna(axis='columns', inplace=True)
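
# (aside) dropna(axis='columns') drops any column containing even one missing
# value, so each year's frame can end up with a different set of surviving
# columns; a quick live check of what remains in each:
for df in df_list:
    print(df.shape)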

# loop through dataframes and drop any irrelevant or problematic columns

for df in df_list:
    df.drop(["COU",
             "Country",
             "Literacy rate, adult total (% of people ages 15 and above)",
             "GINI index (World Bank estimate)_x",
             "Mortality Causes",
             "Year",
             "Standard deviation/Mean of ladder by country-year"], axis=1, inplace=True)

# concatenate the dataframes into one

unified_df = pd.concat(df_list, ignore_index=True)
unified_df.head()

# do another pass to drop null values (columns missing from some years come
# out of the concat full of NaNs)

unified_df.dropna(axis='columns', inplace=True)
unified_df.head()

# rename columns for legibility

renamed_unified_df = unified_df.rename(columns={
    "Life Ladder": "Happiness Index",
    "gini of household income reported in Gallup, by wp5-year": "Gini household income",
    "Population density (people per sq. km of land area)": "Population density",
    "Standard deviation of ladder by country-year": "Standard deviation of ladder",
    "Probability of dying at age 5-14 years (per 1,000 children age 5)": "Probability of dying at age 5-14"})

# RandomForestClassifier is fine with float features, but it needs discrete
# class labels, so I'm casting every column to integer (this also bins the
# continuous Happiness Index into integer classes)

rev_unified_df = renamed_unified_df.astype('int64')
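
# (aside) astype('int64') truncates toward zero, so a ladder score of 5.9
# becomes class 5; rounding first would bin the scores more evenly (a sketch,
# not the original approach):
# rev_unified_df = renamed_unified_df.round().astype('int64')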

# output this dataframe to CSV for use elsewhere

rev_unified_df.to_csv("econ_happy_data_allyears.csv", index=True, header=True)
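
# (aside) index=True writes the row index as an unnamed first column; reading
# the file back cleanly needs index_col=0 (a sketch):
# pd.read_csv("econ_happy_data_allyears.csv", index_col=0)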

# assign X (data) and y (target)

X = rev_unified_df.drop("Happiness Index", axis=1)
y = rev_unified_df["Happiness Index"]
print(X.shape, y.shape)

# establish variable to hold feature names (the names of the remaining columns)

feature_names = X.columns

# split data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
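
# (aside) train_test_split defaults to a 75/25 split; passing stratify=y
# would keep the mix of integer happiness classes consistent across train and
# test (a sketch, assuming every class has at least two rows):
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, random_state=42, stratify=y)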

# create a random forest classifier

rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)
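
# (aside) .score on a classifier is mean accuracy over the integer happiness
# classes; since the underlying Life Ladder score is continuous, a regression
# forest is a natural alternative to compare (a sketch; its .score is R^2,
# and the float targets would need to be preserved upstream):
# from sklearn.ensemble import RandomForestRegressor
# reg = RandomForestRegressor(n_estimators=200).fit(X_train, y_train)
# reg.score(X_test, y_test)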

# calculate feature importance

importances = rf.feature_importances_
importances

# sort the features by their importance

sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# create feature importance df from which to sort and plot

fi_df = pd.DataFrame(list(zip(feature_names, importances)),
                     columns=['Feature', 'Importance'])
rev_fi_df = fi_df.sort_values(by=['Importance'], ascending=True)
rev_fi_df
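
# (aside) the ascending sort is deliberate: plt.barh draws row 0 at the
# bottom of the axis, so sorting least-to-most important puts the strongest
# feature at the top of the chart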

# plot feature importance

objects = rev_fi_df['Feature']
y_pos = np.arange(len(objects))
importance = rev_fi_df['Importance']
# highlight the single most important feature in pink; gray out the rest
colors = ['gray' if (x < max(importance)) else '#FC0280' for x in importance]

plt.barh(y_pos, importance, align='center', alpha=0.5, color=colors)
plt.yticks(y_pos, objects)
plt.xlabel('Feature Importance')
plt.title('Comparative Happiness Feature Performance')
plt.savefig('random_forests_fi.svg', bbox_inches='tight')  # save before plt.show() so the rendered figure is captured
plt.show()