Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
209
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.46 KB | None | 0 0
  1. ##import packages
  2. import pandas as pd
  3. from sklearn import preprocessing
  4. import sklearn.model_selection as ms
  5. from sklearn import linear_model
  6. import sklearn.metrics as sklm
  7. import numpy as np
  8. import numpy.random as nr
  9. import matplotlib.pyplot as plt
  10. import seaborn as sns
  11. import scipy.stats as ss
  12. import math
  13. %matplotlib inline
  14.  
  15. ##load datasets with pandas
  16. train_values = pd.read_csv('train_values.csv')
  17. train_labels = pd.read_csv('train_labels.csv')
  18. test_values = pd.read_csv('test_values.csv')
  19.  
  20. ##viewing value data type
  21. train_values.dtypes
  22.  
  23. ##view columns with null values and count them
  24. print(train_values.isnull().sum())
  25. train_values.shape
  26.  
  27. ##drop columns with too many missing values
  28. train_values.drop(['bank_interest_rate'], axis = 1, inplace = True)
  29. train_values.drop(['mm_interest_rate'], axis = 1, inplace = True)
  30. train_values.drop(['mfi_interest_rate'], axis = 1, inplace = True)
  31. train_values.drop(['other_fsp_interest_rate'], axis = 1, inplace = True)
  32. train_values.shape
  33.  
  34. ##drop rows with missing values
  35. cols = ['education_level', 'share_hh_income_provided']
  36. for column in cols:
  37. train_values.loc[train_values[column].isnull()] = np.nan
  38. train_values.dropna(axis = 0, inplace = True)
  39. train_values.shape
  40.  
  41. ##checking if it worked
  42. print(train_values.isnull().sum())
  43.  
  44. ##merging data into one frame
  45. training_set = pd.merge(train_values, train_labels, on = 'row_id')
  46. training_set.head(1)
  47.  
  48. ##encoding categorical features into binary numeric data
  49. def encode_string(cat_feature):
  50. enc = preprocessing.LabelEncoder()
  51. enc.fit(cat_feature)
  52. enc_cat_feature = enc.transform(cat_feature)
  53. ohe = preprocessing.OneHotEncoder()
  54. encoded = ohe.fit(enc_cat_feature.reshape(-1,1))
  55. return encoded.transform(enc_cat_feature.reshape(-1,1)).toarray()
  56.  
  57. categorical_columns = ['religion', 'relationship_to_hh_head',
  58. 'employment_category_last_year', 'employment_type_last_year']
  59. Features = encode_string(training_set['country'])
  60. for col in categorical_columns:
  61. temp = encode_string(training_set[col])
  62. Features = np.concatenate([Features, temp], axis = 1)
  63. print(Features.shape)
  64.  
  65. ##adding on the numeric columns
  66. Features = np.concatenate([Features, np.array(training_set[['is_urban', 'age', 'female', 'married',
  67. 'education_level', 'literacy', 'can_add',
  68. 'can_divide', 'can_calc_percents', 'can_calc_compounding',
  69. 'employed_last_year',
  70. 'share_hh_income_provided',
  71. 'income_ag_livestock_last_year', 'income_friends_family_last_year',
  72. 'income_government_last_year', 'income_own_business_last_year',
  73. 'income_private_sector_last_year', 'income_public_sector_last_year',
  74. 'num_times_borrowed_last_year', 'borrowing_recency', 'formal_savings',
  75. 'informal_savings', 'cash_property_savings', 'has_insurance',
  76. 'has_investment', 'num_shocks_last_year',
  77. 'avg_shock_strength_last_year', 'borrowed_for_emergency_last_year',
  78. 'borrowed_for_daily_expenses_last_year',
  79. 'borrowed_for_home_or_biz_last_year', 'phone_technology', 'can_call',
  80. 'can_text', 'can_use_internet', 'can_make_transaction',
  81. 'phone_ownership', 'advanced_phone_use', 'reg_bank_acct', 'reg_mm_acct',
  82. 'reg_formal_nbfi_account', 'financially_included', 'active_bank_user',
  83. 'active_mm_user', 'active_formal_nbfi_user',
  84. 'active_informal_nbfi_user', 'nonreg_active_mm_user',
  85. 'num_formal_institutions_last_year',
  86. 'num_informal_institutions_last_year',
  87. 'num_financial_activities_last_year']])], axis = 1)
  88. Features.shape
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement