Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# --- Raw data loading --------------------------------------------------------
# Tables of the Home Credit default-risk dataset.
# NOTE(review): paths are hard-coded to this notebook's Kaggle inputs.
train_raw = pd.read_csv('/kaggle/input/homecredit/application_train.csv')
test_raw = pd.read_csv('/kaggle/input/homecredit/application_test.csv')
# Column-description file is not UTF-8, hence the explicit latin1 encoding.
info_df = pd.read_csv('/kaggle/input/homecredit-additional/HomeCredit_columns_description.csv',encoding = 'latin1')
bureau = pd.read_csv('/kaggle/input/homecredit-additional/bureau.csv')
credit_card_raw = pd.read_csv('/kaggle/input/full-homecredit/credit_card_balance.csv')
installments_raw = pd.read_csv('/kaggle/input/full-homecredit/installments_payments.csv')
posh_cash_raw = pd.read_csv('/kaggle/input/full-homecredit/POS_CASH_balance.csv')
# One row per distinct bureau client id; used as the spine that the bureau
# aggregates are merged onto below.
SK_ID = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()})
def object_feature_engineering(data):
    """Convert the object (string) columns of an application table to numeric.

    Binary flags become 0/1 ints, multi-valued categoricals become integer
    category codes (-1 for missing), and two sparse columns are dropped.
    Returns a new DataFrame; ``data`` is not modified.
    """
    new_data = data.copy()
    # Binary indicator columns.
    new_data['NAME_CONTRACT_TYPE'] = (new_data['NAME_CONTRACT_TYPE'] == 'Cash loans').astype(int)
    # 'XNA' is a placeholder for unknown gender; map it to NaN before the
    # comparison (NaN == 'M' is False, so unknown still encodes as 0).
    # FIX: the original used chained `.replace(..., inplace=True)` on a column
    # selection, which is deprecated and unreliable under pandas copy-on-write.
    new_data['CODE_GENDER'] = (new_data['CODE_GENDER'].replace('XNA', np.nan) == 'M').astype(int)
    new_data['FLAG_OWN_CAR'] = (new_data['FLAG_OWN_CAR'] == 'Y').astype(int)
    new_data['FLAG_OWN_REALTY'] = (new_data['FLAG_OWN_REALTY'] == 'Y').astype(int)
    # Placeholder strings that should be treated as missing before encoding.
    na_placeholders = {'NAME_FAMILY_STATUS': 'Unknown', 'ORGANIZATION_TYPE': 'XNA'}
    # Multi-valued categoricals -> integer codes (NaN encodes as -1).
    categorical_columns = [
        'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
    ]
    for column in categorical_columns:
        series = new_data[column]
        if column in na_placeholders:
            series = series.replace(na_placeholders[column], np.nan)
        new_data[column] = pd.Categorical(series).codes
    # These two columns were judged uninformative and are removed entirely.
    new_data = new_data.drop(columns=['WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'])
    return new_data
def category_engineer_bureau(df):
    """Engineer binary credit-status features on a copy of the bureau table.

    Adds:
      CREDIT_ACTIVE_BINARY  - 1 if the loan is not 'Closed', else 0.
      CREDIT_ENDDATE_NORMAL - 1 if the loan end date lies in the future
                              (DAYS_CREDIT_ENDDATE > 0), else 0.
    Drops the two source columns once encoded.  Returns a new DataFrame.
    """
    X = df.copy()
    X['CREDIT_ACTIVE_BINARY'] = (X['CREDIT_ACTIVE'] != 'Closed').astype(int)
    X['CREDIT_ENDDATE_NORMAL'] = (X['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
    # BUG FIX: the original called X.drop(...) without assigning the result,
    # so the raw columns were never actually removed.
    return X.drop(columns=['CREDIT_ACTIVE', 'DAYS_CREDIT_ENDDATE'])
def custom_features(df):
    """Add ratio/interaction features derived from the application columns.

    Returns a new DataFrame with the extra columns; ``df`` is unchanged.
    Divisions may produce inf/NaN on zero denominators; the pipeline replaces
    inf with NaN and re-fills afterwards.
    """
    X = df.copy()
    X['ANNUITY_TO_INCOME_RATIO'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
    X['INCOME_TO_CREDIT_RATIO'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
    X['CAR_TO_BIRTH_RATIO'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
    X['CAR_TO_EMPLOY_RATIO'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']
    X['CHILDREN_TO_FAMILY_MEMBERS_RATIO'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
    X['CREDIT_TO_ANNUITY_RATIO'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
    X['CREDIT_TO_GOODS_RATIO'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
    X['CREDIT_TO_INCOME_RATIO'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
    X['DAYS_EMPLOYED_TO_BIRTH_RATIO'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
    # +1 guards against division by zero for childless applicants.
    X['INCOME_PER_CHILD'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
    X['INCOME_PER_FAM_MEMBER'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
    X['PAYMENTS_RATE'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
    # Weighted sum of the external credit scores.
    X['EXTERNAL_SOURCES_TOTAL'] = 2 * X['EXT_SOURCE_1'] + 3 * X['EXT_SOURCE_2'] + 4 * X['EXT_SOURCE_3']
    # Row-wise aggregates over the three external scores.
    # FIX: getattr replaces the original eval(f'np.{name}') -- same functions,
    # no eval on a constructed string.
    ext_sources = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
        X['EXT_SOURCES_{}'.format(function_name)] = getattr(np, function_name)(ext_sources, axis=1)
    return X
def Build_And_Test_Model(data):
    """Fit an LGBM classifier on a 70/30 split of ``data`` and print hold-out AUC.

    ``data`` must contain a 'TARGET' column.  Returns the fitted model plus
    the (features, labels) pairs for both halves of the split.
    """
    features = data.drop(columns=['TARGET'])
    labels = data['TARGET']
    split = train_test_split(features, labels, test_size=0.3, random_state=50)
    X_train, X_test, y_train, y_test = split
    model = LGBMClassifier(n_estimators=340, max_depth=4, num_leaves=30, objective='binary')
    model.fit(X_train, y_train)
    holdout_scores = model.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, holdout_scores))
    return model, (X_train, y_train), (X_test, y_test)
def Feature_Importance(X_train, y_train, X_test, y_test):
    """Fit a decision tree and return its per-feature importances.

    Prints the tree's mean accuracy on both splits, then returns a Series of
    ``feature_importances_`` indexed by feature name.
    """
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    # BUG FIX: classifier .score() returns mean accuracy, not R^2; the
    # original printed a misleading "R^2" label.
    print("Accuracy on the train set:")
    print(clf.score(X_train, y_train))
    print("Accuracy on the test set:")
    print(clf.score(X_test, y_test))
    feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns.values)
    return feature_importances
def fill_nans(data, logs=False):
    """Return a copy of ``data`` with numeric NaNs filled or the column dropped.

    Strategy, per column that contains NaNs:
      * rows with a missing SK_ID_CURR are dropped up front;
      * object/category columns are left untouched (encoded elsewhere);
      * EXT_SOURCE_1..3 are filled with 0.0 (missing score treated as 0);
      * columns that are more than 35% NaN are dropped entirely, except for
        the whitelist in ``not_to_delete``;
      * otherwise fill with the column mode when the mode is integer-valued
        (likely a discrete feature), else with the mean, falling back to the
        mode and finally to 0 when the candidates are not finite.
    """
    result = data.copy()
    result.dropna(subset=['SK_ID_CURR'], inplace=True)
    not_to_delete = ['OWN_CAR_AGE']  # kept even when mostly NaN
    # FIX: assign instead of chained `.fillna(..., inplace=True)` on a column
    # selection (deprecated, unreliable under pandas copy-on-write).
    for ext_column in ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'):
        result[ext_column] = result[ext_column].fillna(0.0)
    to_fill_nan = result.columns[result.isna().any()].tolist()
    for feature in to_fill_nan:
        if result.dtypes[feature] == 'object' or result.dtypes[feature] == 'category':
            continue
        count_of_nans = result[feature].isna().sum()
        if feature not in not_to_delete and count_of_nans > int(0.35 * len(result[feature])):
            result.drop(columns=[feature], inplace=True)
            continue
        mean = result[feature].mean()
        modes = result[feature].mode()
        # BUG FIX: guard the empty-mode (all-NaN whitelisted column) and
        # non-finite cases; the original called int(mode) first, which raised
        # on NaN/inf, and mode()[0] raised IndexError on an all-NaN column.
        mode = modes.iloc[0] if len(modes) else np.nan
        if np.isfinite(mode) and int(mode) == mode:
            # Integer-valued mode -> treat the feature as discrete.
            mean = mode
        if logs:
            print(f'feature name: {feature}; mean: {mean}; mode {mode};')
        if np.isfinite(mean):
            result[feature] = result[feature].fillna(mean)
        elif np.isfinite(mode):
            result[feature] = result[feature].fillna(mode)
        else:
            result[feature] = result[feature].fillna(0)
    return result
def Encode(data, logs=False):
    """One-hot encode every object/category column of ``data``.

    Each category becomes a new 0/1 column named '<feature>_<category>' with
    spaces and punctuation sanitised; the original categorical columns are
    dropped at the end.  Returns a new DataFrame.

    Requires ``OneHotEncoder`` (sklearn.preprocessing) to be imported at the
    top of the file.
    """
    df = data.copy()
    encoder = OneHotEncoder(sparse_output=False)
    categorical_features = df.select_dtypes(include=['object', 'category'])
    if logs:
        print(f"Your categorical features: {categorical_features}")
    for feature in categorical_features:
        encoded_feature = encoder.fit_transform(df[feature].values.reshape(-1, 1))
        feature_categories = encoder.categories_[0]
        new_columns = [feature + '_' + str(category)
                       .replace(' ', '_')
                       .replace(':', '')
                       .replace('/', '_or_')
                       .replace(',', '')
                       .replace('-', '_') for category in feature_categories]
        # BUG FIX: build the encoded frame on df's own index.  With the
        # default RangeIndex, pd.concat aligned by label and misplaced or
        # NaN-padded rows whenever df's index was not 0..n-1 (e.g. after
        # fill_nans dropped rows).
        encoded_df = pd.DataFrame(encoded_feature, columns=new_columns, index=df.index)
        if logs:
            print(encoded_df.info())
        df = pd.concat([df, encoded_df], axis=1)
    df.drop(categorical_features.columns, axis=1, inplace=True)
    return df
def merge_data(base=None):
    """Join the per-source aggregate tables onto an application table.

    Parameters
    ----------
    base : DataFrame, optional
        Application table containing SK_ID_CURR.  Defaults to the global
        ``train_raw`` (the original behaviour); passing ``test_raw`` lets the
        same code build the test matrix.

    Left-joins each ``*_grouped`` table on SK_ID_CURR; validate='one_to_one'
    asserts every aggregate table carries at most one row per client.
    Returns the merged DataFrame.
    """
    result = (train_raw if base is None else base).copy()
    result.replace([np.inf, -np.inf], np.nan, inplace=True)
    for aggregate_table in (bureau_grouped,
                            credit_card_grouped,
                            installments_grouped,
                            posh_cash_grouped,
                            previous_application_grouped):
        result = result.merge(aggregate_table,
                              on='SK_ID_CURR',
                              how='left',
                              validate='one_to_one')
    return result
# --- bureau aggregates (one row per SK_ID_CURR) ------------------------------
bureau_groupby = category_engineer_bureau(bureau).groupby(by=['SK_ID_CURR'])

# (source column, aggregation, output name) for the per-client bureau table.
bureau_agg_spec = [
    ('DAYS_CREDIT', 'count', 'COUNT_OF_PAST_LOANS'),
    ('CREDIT_TYPE', 'nunique', 'COUNT_OF_LOAN_TYPES'),
    ('CREDIT_ACTIVE_BINARY', 'mean', 'AVERAGE_CREDIT_ACTIVE'),
    ('CREDIT_ENDDATE_NORMAL', 'mean', 'AVERAGE_CREDIT_ENDDATE'),
    ('AMT_CREDIT_SUM_DEBT', 'sum', 'TOTAL_DEBT'),
    ('AMT_CREDIT_SUM', 'sum', 'TOTAL_CREDIT'),
    ('AMT_CREDIT_SUM_OVERDUE', 'sum', 'TOTAL_OVERDUE'),
    ('CNT_CREDIT_PROLONG', 'sum', 'TOTAL_PROLONGATION'),
]

bureau_grouped = SK_ID.copy()
for source_column, agg_name, output_column in bureau_agg_spec:
    aggregate = (bureau_groupby[source_column]
                 .agg(agg_name)
                 .reset_index()
                 .rename(columns={source_column: output_column}))
    bureau_grouped = bureau_grouped.merge(aggregate, on=['SK_ID_CURR'], how='left')

# Derived ratios over the aggregates (zero denominators may produce inf/NaN;
# the pipeline cleans those up later).
bureau_grouped['AVERAGE_COUNT_LOANS'] = bureau_grouped['COUNT_OF_PAST_LOANS'] / bureau_grouped['COUNT_OF_LOAN_TYPES']
bureau_grouped['DEBT_CREDIT_RATIO'] = bureau_grouped['TOTAL_DEBT'] / bureau_grouped['TOTAL_CREDIT']
bureau_grouped['OVERDUE_DEBT_RATIO'] = bureau_grouped['TOTAL_OVERDUE'] / bureau_grouped['TOTAL_DEBT']
# --- credit_card_balance aggregates (one row per SK_ID_CURR) -----------------
# BUG FIX: the original assigned groupby().agg() results (one row per GROUP)
# straight back into credit_card_raw columns (one row per balance record);
# pandas aligned them on the RangeIndex, so values landed on arbitrary rows.
# The per-loan statistics are now computed as their own series and combined
# per client without touching the raw frame.

# Number of instalments actually paid on each previous loan.
cnt_instalments_per_loan = (credit_card_raw
                            .groupby(['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM']
                            .max())
# Peak utilisation of each credit line: max balance over the credit limit.
max_loading_per_loan = (credit_card_raw
                        .groupby(['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL'])
                        .apply(lambda g: g.AMT_BALANCE.max() / g.AMT_CREDIT_LIMIT_ACTUAL.max()))

credit_card_groupby = credit_card_raw.groupby(by=['SK_ID_CURR'])
group_SK_ID_PREV = credit_card_groupby['SK_ID_PREV'].agg('nunique').reset_index()
group_CNT_INSTALMENTS = cnt_instalments_per_loan.groupby('SK_ID_CURR').sum().reset_index()
group_MAX_LOADING = max_loading_per_loan.groupby('SK_ID_CURR').mean().reset_index(name='MAX_LOADING')
group_SK_DPD = credit_card_groupby['SK_DPD'].agg('mean').reset_index()
group_AMT_DRAWINGS_ATM_CURRENT = credit_card_groupby['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()
group_AMT_DRAWINGS_CURRENT = credit_card_groupby['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()

group_SK_ID_PREV.rename(columns={'SK_ID_PREV': 'COUNT_OF_LOANS'}, inplace=True)
group_CNT_INSTALMENTS.rename(columns={'CNT_INSTALMENT_MATURE_CUM': 'TOTAL_INSTALMENTS'}, inplace=True)
group_MAX_LOADING.rename(columns={'MAX_LOADING': 'AVG_LOADING'}, inplace=True)
group_SK_DPD.rename(columns={'SK_DPD': 'AVG_DPD'}, inplace=True)
group_AMT_DRAWINGS_ATM_CURRENT.rename(columns={'AMT_DRAWINGS_ATM_CURRENT': 'TOTAL_DRAWINGS_ATM'}, inplace=True)
group_AMT_DRAWINGS_CURRENT.rename(columns={'AMT_DRAWINGS_CURRENT': 'TOTAL_DRAWINGS'}, inplace=True)

credit_card_grouped = pd.DataFrame({'SK_ID_CURR': credit_card_raw['SK_ID_CURR'].unique()})
for group_table in (group_SK_ID_PREV, group_CNT_INSTALMENTS, group_MAX_LOADING,
                    group_SK_DPD, group_AMT_DRAWINGS_ATM_CURRENT, group_AMT_DRAWINGS_CURRENT):
    credit_card_grouped = credit_card_grouped.merge(group_table, on=['SK_ID_CURR'], how='left')

credit_card_grouped['INSTALLMENTS_PER_LOAN'] = credit_card_grouped['TOTAL_INSTALMENTS'] / credit_card_grouped['COUNT_OF_LOANS']
credit_card_grouped['CASH_RATIO'] = credit_card_grouped['TOTAL_DRAWINGS_ATM'] / credit_card_grouped['TOTAL_DRAWINGS']
# --- previous_application aggregates (one row per SK_ID_CURR) ----------------
previous_application_raw = pd.read_csv('/kaggle/input/full-homecredit/previous_application.csv')

# Numeric columns worth aggregating.  RATE_DOWN_PAYMENT and
# RATE_INTEREST_PRIMARY are excluded: too many missing values.
feature_names = ['AMT_ANNUITY',
                 'AMT_APPLICATION',
                 'AMT_CREDIT',
                 'AMT_DOWN_PAYMENT',
                 'AMT_GOODS_PRICE',
                 'DAYS_DECISION',
                 'SELLERPLACE_AREA',
                 'CNT_PAYMENT']

previous_application_grouped = pd.DataFrame({'SK_ID_CURR': previous_application_raw['SK_ID_CURR'].unique()})
# BUG FIX: the original discarded the result of every .merge() call, so no
# aggregate column was ever attached; it also re-aggregated the whole frame
# once per (column, agg) pair.  Aggregate once and merge the flat table.
previous_application_agg = (previous_application_raw[['SK_ID_CURR'] + feature_names]
                            .groupby('SK_ID_CURR')
                            .agg(['mean', 'min', 'max', 'var']))
previous_application_agg.columns = ['{}_{}'.format(column, agg)
                                    for column, agg in previous_application_agg.columns]
previous_application_grouped = previous_application_grouped.merge(previous_application_agg.reset_index(),
                                                                  on=['SK_ID_CURR'], how='left')
# --- installments_payments aggregates (one row per SK_ID_CURR) ---------------
installments_feature_names = ['AMT_INSTALMENT',
                              'AMT_PAYMENT',
                              'DAYS_ENTRY_PAYMENT',
                              'DAYS_INSTALMENT',
                              'NUM_INSTALMENT_NUMBER',
                              'NUM_INSTALMENT_VERSION']

installments_grouped = pd.DataFrame({'SK_ID_CURR': installments_raw['SK_ID_CURR'].unique()})
# BUG FIX: as with the other tables, the original dropped every .merge()
# result, leaving installments_grouped with only the id column.
installments_agg = (installments_raw[['SK_ID_CURR'] + installments_feature_names]
                    .groupby('SK_ID_CURR')
                    .agg(['min', 'max', 'sum', 'mean']))
installments_agg.columns = ['{}_{}'.format(column, agg) for column, agg in installments_agg.columns]
installments_grouped = installments_grouped.merge(installments_agg.reset_index(),
                                                  on=['SK_ID_CURR'], how='left')
# --- POS_CASH_balance aggregates (one row per SK_ID_CURR) --------------------
pos_cash_feature_names = ['MONTHS_BALANCE',
                          'CNT_INSTALMENT',
                          'CNT_INSTALMENT_FUTURE',
                          'SK_DPD',
                          'SK_DPD_DEF']

posh_cash_grouped = pd.DataFrame({'SK_ID_CURR': posh_cash_raw['SK_ID_CURR'].unique()})
# BUG FIX: the original discarded every .merge() result; aggregate once and
# actually attach the columns.
posh_cash_agg = (posh_cash_raw[['SK_ID_CURR'] + pos_cash_feature_names]
                 .groupby('SK_ID_CURR')
                 .agg(['mean', 'min', 'max', 'var']))
posh_cash_agg.columns = ['{}_{}'.format(column, agg) for column, agg in posh_cash_agg.columns]
posh_cash_grouped = posh_cash_grouped.merge(posh_cash_agg.reset_index(), on=['SK_ID_CURR'], how='left')
# --- Training pipeline -------------------------------------------------------
train_valid = merge_data()
# train_valid = object_feature_engineering(train_valid)
train_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
train_valid = fill_nans(train_valid)
train_valid = Encode(train_valid, logs=False)
train_valid = custom_features(train_valid)
# The ratio features can reintroduce inf (zero denominators); clean again.
train_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
train_valid = fill_nans(train_valid)

# Baseline model on the full feature set.
model_valid, (X_train, y_train), (X_test, y_test) = Build_And_Test_Model(train_valid)

# Keep only the features a decision tree finds useful (importance > 0).
# (The original also had a dead `feature_importance[...].index` expression
# and a 'train_valied' typo; both cleaned up.)
feature_importance = Feature_Importance(X_train, y_train, X_test, y_test)
feature_importance = feature_importance.sort_values(ascending=False)
test_features = feature_importance[feature_importance > 0].index.tolist()
train_valid_with_feature_importance = train_valid[test_features + ['TARGET']]
_, (X_train, y_train), (X_test, y_test) = Build_And_Test_Model(train_valid_with_feature_importance)

# Cross-validated evaluation of the pruned feature set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train_valid_with_feature_importance.drop(columns=['TARGET'])
y = train_valid_with_feature_importance['TARGET']
model = LGBMClassifier(n_estimators=340, max_depth=4, num_leaves=30, objective='binary')
scores = cross_validate(model, X, y, cv=cv, scoring='roc_auc', return_estimator=True)
mean_score = scores['test_score'].mean()
std_score = scores['test_score'].std()
print(f'ROC_AUC : {mean_score} +/- {std_score}')
# NOTE(review): X_test overlaps the CV training folds, so this final AUC is
# optimistic; score on genuinely held-out data for an honest estimate.
best_estimator = scores['estimator'][scores['test_score'].argmax()]
print(roc_auc_score(y_test, best_estimator.predict_proba(X_test)[:, 1]))
Advertisement
Add Comment
Please, Sign In to add comment