Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #import dependencies
- get_ipython().run_line_magic('matplotlib', 'inline')
- import matplotlib.pyplot as plt
- import matplotlib.pyplot as plt; plt.rcdefaults()
- import numpy as np
- import seaborn as sns; sns.set(style="ticks", color_codes=True)
- import pandas as pd
- import os
- from sklearn.model_selection import train_test_split
- from sklearn.ensemble import RandomForestClassifier
- import warnings
- warnings.simplefilter('ignore')
# Read in CSVs for all available years (2010-2017).
# Loading in a loop keeps the file-name pattern in one place instead of
# eight copy-pasted read_csv calls.
YEARS = range(2010, 2018)
df_list = [pd.read_csv('Machine_Learning_Dataset_{}.csv'.format(year))
           for year in YEARS]

# Preserve the per-year variable names used elsewhere in this script.
(raw_2010_economic_data, raw_2011_economic_data,
 raw_2012_economic_data, raw_2013_economic_data,
 raw_2014_economic_data, raw_2015_economic_data,
 raw_2016_economic_data, raw_2017_economic_data) = df_list

# Drop any column containing null values from each yearly dataframe.
# Done per-frame (before concatenation) because each year has its own gaps.
for df in df_list:
    df.dropna(axis='columns', inplace=True)
# Remove identifier, duplicate, and known-problematic columns from every
# yearly dataframe before they are merged into one.
COLUMNS_TO_DROP = [
    "COU",
    "Country",
    "Literacy rate, adult total (% of people ages 15 and above)",
    "GINI index (World Bank estimate)_x",
    "Mortality Causes",
    "Year",
    "Standard deviation/Mean of ladder by country-year",
]
for yearly_df in df_list:
    yearly_df.drop(COLUMNS_TO_DROP, axis=1, inplace=True)
# Concatenate the yearly dataframes into one. df_list already holds all
# eight frames (same objects as the raw_* variables), so reuse it rather
# than re-listing every variable by hand.
unified_df = pd.concat(df_list, ignore_index=True)
unified_df.head()

# Do another pass to drop null values: a column can be complete within one
# year but missing in another, which introduces NaNs after concatenation.
unified_df.dropna(axis='columns', inplace=True)
unified_df.head()
# Rename unwieldy survey / World Bank column names for legibility.
column_aliases = {
    "Life Ladder": "Happiness Index",
    "gini of household income reported in Gallup, by wp5-year": "Gini household income",
    "Population density (people per sq. km of land area)": "Population density",
    "Standard deviation of ladder by country-year": "Standard deviation of ladder",
    "Probability of dying at age 5-14 years (per 1,000 children age 5)": "Probability of dying at age 5-14",
}
renamed_unified_df = unified_df.rename(columns=column_aliases)

# Truncate every column to int64 so the ladder score becomes a small set of
# discrete labels for the classifier below.
# NOTE(review): sklearn classifiers accept float *features*; only the target
# needs to be discrete. Casting everything discards precision — confirm this
# blanket truncation is intentional.
rev_unified_df = renamed_unified_df.astype('int64')

# Output this dataframe to CSV for use elsewhere.
rev_unified_df.to_csv("econ_happy_data_allyears.csv", index=True, header=True)
# Separate the target (y) from the features (X): we predict the
# integer-valued Happiness Index from the remaining columns.
y = rev_unified_df["Happiness Index"]
X = rev_unified_df.drop("Happiness Index", axis=1)
print(X.shape, y.shape)

# Keep the remaining column names for the feature-importance report below.
feature_names = X.columns

# Hold out a test set (sklearn's default 25% split) with a fixed seed so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a 200-tree random forest classifier and report mean accuracy on the
# held-out test set.
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)
# Calculate per-feature importance scores from the fitted forest.
importances = rf.feature_importances_
importances

# Display the features ranked from most to least important.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Build a feature-importance dataframe from which to sort and plot.
fi_df = pd.DataFrame(list(zip(feature_names, importances)),
                     columns=['Feature', 'Importance'])
rev_fi_df = fi_df.sort_values(by=['Importance'], ascending=True)
rev_fi_df

# Plot feature importance as a horizontal bar chart, highlighting the
# single most important feature in pink.
objects = rev_fi_df['Feature']
y_pos = np.arange(len(objects))
importance = rev_fi_df['Importance']
# Hoist max() out of the comprehension: recomputing it for every element
# made the color assignment O(n^2) for no benefit.
top_importance = max(importance)
colors = ['gray' if (x < top_importance) else '#FC0280' for x in importance]
plt.barh(y_pos, importance, align='center', alpha=0.5, color=colors)
plt.yticks(y_pos, objects)
plt.xlabel('Feature Importance')
plt.title('Comparative Happiness Feature Performance')
plt.savefig('random_forests_fi.svg', bbox_inches='tight')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement