Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.model_selection import train_test_split, RandomizedSearchCV
- from sklearn import metrics
- from sklearn.pipeline import Pipeline
- from sklearn.impute import SimpleImputer
- from category_encoders import TargetEncoder
- from sklearn.compose import ColumnTransformer
- from catboost import CatBoostRegressor
# Read the training and test CSV files into DataFrames (same order as the
# path tuple: train first, then test).
model_frame, test_frame = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
def preprocess(frame):
    """Run every cleaning step on *frame* and return the cleaned result.

    The steps are applied strictly in the order listed below; each one
    receives the frame returned by the previous step.
    """
    cleaning_steps = (
        drop_duplicates,
        drop_unwanted_columns,
        treat_year_of_record,
        treat_housing_situation,
        treat_work_experience,
        treat_university_degree,
        treat_gender,
        treat_additional_income,
        treat_NaN_values,
    )
    for step in cleaning_steps:
        frame = step(frame)
    return frame
def drop_duplicates(frame):
    """Sort *frame* by 'Instance' and keep one row per 'Instance' value.

    The frame is modified in place and the same object is returned.

    A stable sort is requested explicitly: the default sort algorithm does
    not guarantee the relative order of equal keys, which would make
    ``keep='first'`` pick an arbitrary row among duplicates. With a stable
    sort the row kept is always the one that appeared first in the input.
    """
    frame.sort_values('Instance', kind='mergesort', inplace=True)
    frame.drop_duplicates('Instance', keep='first', inplace=True)
    return frame
def drop_unwanted_columns(frame):
    """Return a copy of *frame* without the columns excluded from modelling."""
    unwanted = ['Instance', 'Wears Glasses', 'Hair Color']
    return frame.drop(unwanted, axis='columns')
def treat_year_of_record(frame):
    """Back-fill missing 'Year of Record' values from the next valid row.

    The column is overwritten on *frame*, which is also returned. Missing
    values at the very end of the column have no following value to copy
    and therefore stay missing.
    """
    # Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
    frame['Year of Record'] = frame['Year of Record'].bfill()
    return frame
def treat_gender(frame):
    """Rewrite the abbreviation 'f' in the 'Gender' column as 'female'.

    The column is overwritten on *frame*, which is also returned.
    """
    gender = frame['Gender']
    frame['Gender'] = gender.replace(to_replace='f', value='female')
    return frame
def treat_housing_situation(frame):
    """Normalise the 'Housing Situation' column and cast it to ``str``.

    The placeholder values '0', 0 and 'nA' are all rewritten as 'none' on
    *frame* itself; the string cast is then performed by ``astype``, so the
    frame that is returned is a new object.
    """
    column = 'Housing Situation'
    placeholders = {'0': 'none', 0: 'none', 'nA': 'none'}
    frame[column] = frame[column].replace(placeholders)
    return frame.astype({column: str})
def treat_work_experience(frame):
    """Convert the work-experience column to numbers.

    Entries that cannot be parsed as a number become NaN
    (``errors='coerce'``). The column is overwritten on *frame*, which is
    also returned.
    """
    column = 'Work Experience in Current Job [years]'
    numeric_years = pd.to_numeric(frame[column], errors='coerce')
    frame[column] = numeric_years
    return frame
def treat_university_degree(frame):
    """Replace the string '0' in 'University Degree' with 'No'.

    Matching cells are updated in place on *frame*, which is also returned.
    """
    column = 'University Degree'
    is_zero_marker = frame[column] == '0'
    frame.loc[is_zero_marker, column] = 'No'
    return frame
def treat_additional_income(frame):
    """Convert the additional-income column from text like '123.4 EUR' to float.

    The column is overwritten on *frame*, which is also returned.

    String cells have surrounding whitespace and one trailing 'EUR' suffix
    removed before conversion. Cells that are not strings (for example NaN
    for a missing value, or an already numeric value) are passed straight
    to ``float`` instead of raising ``AttributeError``.

    Raises:
        ValueError: if a string cell is not a number after the suffix is
            removed.
    """
    column = 'Yearly Income in addition to Salary (e.g. Rental Income)'
    suffix = 'EUR'

    def _to_amount(value):
        # Parse one cell into a float amount.
        if not isinstance(value, str):
            return float(value)
        text = value.strip()
        # Remove the suffix as a whole; str.rstrip('EUR') would instead strip
        # any trailing run of the characters 'E', 'U' and 'R'.
        if text.endswith(suffix):
            text = text[:-len(suffix)]
        return float(text)

    frame[column] = frame[column].map(_to_amount)
    return frame
def treat_NaN_values(frame):
    """Turn '#NUM!' markers into NaN, then back-fill every missing value.

    Returns a new frame; *frame* itself is not modified. Each missing cell
    takes the next valid value further down the same column, so missing
    values at the end of a column stay missing.
    """
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
    frame = frame.replace('#NUM!', np.nan)
    # DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
    frame = frame.bfill()
    return frame
def scale_income(frame):
    """Replace the income column with its natural logarithm.

    The 'Total Yearly Income [EUR]' column of *frame* is overwritten with
    the log-transformed values, which are also returned as an array.
    """
    target = 'Total Yearly Income [EUR]'
    log_income = np.log(frame[target])
    frame[target] = log_income
    return frame[target].to_numpy()
# Clean both data sets with the same preprocessing chain.
model_frame = preprocess(model_frame)
test_frame = preprocess(test_frame)

# One-hot encode the training data, then split off the log-transformed target.
model_frame = pd.get_dummies(model_frame, drop_first=False)
income = scale_income(model_frame)
model_frame = model_frame.drop(columns=['Total Yearly Income [EUR]'])

test_frame = pd.get_dummies(test_frame, drop_first=False)
# get_dummies runs on each data set independently, so the two frames can end
# up with different columns. Force the test features to match the training
# features exactly: same columns, same order, 0 for dummies never seen in the
# test data, and anything the model was not trained on is dropped.
test_frame = test_frame.reindex(columns=model_frame.columns, fill_value=0)

# Hold out 20% of the training data for a local error estimate.
X_train, X_test, Y_train, Y_test = train_test_split(
    model_frame, income, train_size=0.8, test_size=0.2)

rsv = RandomizedSearchCV(
    estimator=CatBoostRegressor(),
    param_distributions={
        'learning_rate': (0.01, 0.02, 0.03),
        'n_estimators': (200, 400),
        'max_depth': (4, 8, 12),
    },
    verbose=1, n_iter=10, cv=5, scoring='neg_mean_absolute_error')
rsv.fit(X_train, Y_train)

# The model was fitted on log-income, so predictions are mapped back with exp.
# (The original referred to an undefined name `gcsv`; the fitted search
# object is `rsv`.)
y_predict = np.exp(rsv.predict(test_frame))
print(metrics.mean_absolute_error(np.exp(Y_test), np.exp(rsv.predict(X_test))))

# Instances saved to separate file for ease of access
instances = pd.read_csv('data/instances.csv')['Instance'].values

# Write to File; the context manager guarantees the file is flushed and closed.
with open("data/submission.csv", "w") as f:
    f.write("Instance,Total Yearly Income [EUR]\n")
    for instance, prediction in zip(instances, y_predict):
        f.write(str(instance) + "," + str(prediction) + "\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement