Advertisement
Guest User

Untitled

a guest
Nov 18th, 2019
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.55 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.model_selection import train_test_split, GridSearchCV
  4. from sklearn import metrics
  5. from sklearn.pipeline import Pipeline
  6. from sklearn.impute import SimpleImputer
  7. from category_encoders import TargetEncoder
  8. from sklearn.compose import ColumnTransformer
  9. from catboost import CatBoostRegressor
  10.  
  11. model_frame = pd.read_csv('data/train.csv')
  12. test_frame = pd.read_csv('data/test.csv')
  13.  
  14. def preprocess(frame):
  15. frame = drop_duplicates(frame)
  16. frame = drop_unwanted_columns(frame)
  17. frame = treat_year_of_record(frame)
  18. frame = treat_housing_situation(frame)
  19. frame = treat_work_experience(frame)
  20. frame = treat_university_degree(frame)
  21. frame = treat_gender(frame)
  22. frame = treat_additional_income(frame)
  23. frame = treat_NaN_values(frame)
  24. return frame
  25.  
  26. def drop_duplicates(frame):
  27. frame.sort_values('Instance', inplace = True)
  28. frame.drop_duplicates('Instance', keep = 'first', inplace = True)
  29. return frame
  30.  
  31. def drop_unwanted_columns(frame):
  32. frame = frame.drop(columns = ['Instance','Wears Glasses','Hair Color'])
  33. return frame
  34.  
  35. def treat_year_of_record(frame):
  36. frame['Year of Record'] = frame['Year of Record'].fillna(method='bfill')
  37. return frame
  38.  
  39. def treat_gender(frame):
  40. frame['Gender'] = frame['Gender'].replace({'f': 'female'})
  41. return frame
  42.  
  43. def treat_housing_situation(frame):
  44. frame['Housing Situation'] = frame['Housing Situation'].replace({'0': 'none', 0: 'none', 'nA': 'none'})
  45. frame = frame.astype({'Housing Situation': str})
  46. return frame
  47.  
  48. def treat_work_experience(frame):
  49. frame['Work Experience in Current Job [years]'] = pd.to_numeric(frame['Work Experience in Current Job [years]'], errors='coerce')
  50. return frame
  51.  
  52. def treat_university_degree(frame):
  53. frame.loc[frame['University Degree'] == '0', 'University Degree'] = 'No'
  54. return frame
  55.  
  56. def treat_additional_income(frame):
  57. frame['Yearly Income in addition to Salary (e.g. Rental Income)'] = frame['Yearly Income in addition to Salary (e.g. Rental Income)'].map(lambda x: float(x.rstrip('EUR')))
  58. return frame
  59.  
  60. def treat_NaN_values(frame):
  61. frame = frame.replace('#NUM!', np.NaN)
  62. frame = frame.fillna(method = 'bfill')
  63. return frame
  64.  
  65. def scale_income(frame):
  66. frame['Total Yearly Income [EUR]'] = np.log(frame['Total Yearly Income [EUR]'])
  67. return frame['Total Yearly Income [EUR]'].values
  68.  
  69. model_frame = preprocess(model_frame)
  70. test_frame = preprocess(test_frame)
  71.  
  72. model_frame = pd.get_dummies(model_frame, drop_first=False)
  73. income = scale_income(model_frame)
  74.  
  75. model_frame = model_frame.drop(columns=['Total Yearly Income [EUR]'])
  76.  
  77. test_frame = pd.get_dummies(test_frame, drop_first=False)
  78.  
  79. X_train, X_test, Y_train, Y_test = train_test_split(model_frame, income, train_size = 0.8, test_size = 0.2)
  80.  
  81. gcsv = GridSearchCV(estimator = CatBoostRegressor(learning_rate = 0.01),
  82. param_grid = { 'n_estimators': (200, 400), 'max_depth': (4, 6, 8) },
  83. verbose=1, n_jobs= 8, cv = 5, scoring='neg_mean_absolute_error')
  84.  
  85. gcsv.fit(X_train, Y_train)
  86.  
  87. y_predict = np.exp(gcsv.predict(test_frame))
  88. print(metrics.mean_absolute_error(np.exp(Y_test), np.exp(regr.predict(X_test))))
  89. # Instances saved to separate file for ease of access
  90. instances = pd.read_csv('data/instances.csv')['Instance'].values
  91. f = open("data/submission.csv", "w")
  92.  
  93. # Write to File
  94. f.write("Instance,Total Yearly Income [EUR]\n")
  95.  
  96. for i in range(len(y_predict)):
  97. f.write(str(instances[i]) + "," + str(y_predict[i]) + "\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement