
Kaggle Home Credit

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder  # used by Encode() below

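# Main application tables plus the auxiliary Home Credit tables that are aggregated per client below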
train_raw = pd.read_csv('/kaggle/input/homecredit/application_train.csv')
test_raw = pd.read_csv('/kaggle/input/homecredit/application_test.csv')
info_df = pd.read_csv('/kaggle/input/homecredit-additional/HomeCredit_columns_description.csv', encoding='latin1')
bureau = pd.read_csv('/kaggle/input/homecredit-additional/bureau.csv')
credit_card_raw = pd.read_csv('/kaggle/input/full-homecredit/credit_card_balance.csv')
installments_raw = pd.read_csv('/kaggle/input/full-homecredit/installments_payments.csv')
pos_cash_raw = pd.read_csv('/kaggle/input/full-homecredit/POS_CASH_balance.csv')

# One row per client that appears in bureau.csv; used as the spine for the bureau aggregates
SK_ID = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()})

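# Manual encoding of the application table's object columns: binary columns become 0/1 flags,
# multi-category columns get integer codes, and placeholders ('XNA', 'Unknown') become NaN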
def object_feature_engineering(data):
    new_data = data.copy()

    new_data['NAME_CONTRACT_TYPE']         = (new_data['NAME_CONTRACT_TYPE'] == 'Cash loans').astype(int)
    new_data['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
    new_data['CODE_GENDER']                = (new_data['CODE_GENDER'] == 'M').astype(int)
    new_data['FLAG_OWN_CAR']               = (new_data['FLAG_OWN_CAR'] == 'Y').astype(int)
    new_data['FLAG_OWN_REALTY']            = (new_data['FLAG_OWN_REALTY'] == 'Y').astype(int)
    # pd.Categorical(...).codes encodes categories as integers and missing values as -1
    new_data['NAME_TYPE_SUITE']            = pd.Categorical(new_data['NAME_TYPE_SUITE']).codes
    new_data['NAME_INCOME_TYPE']           = pd.Categorical(new_data['NAME_INCOME_TYPE']).codes
    new_data['NAME_EDUCATION_TYPE']        = pd.Categorical(new_data['NAME_EDUCATION_TYPE']).codes
    new_data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
    new_data['NAME_FAMILY_STATUS']         = pd.Categorical(new_data['NAME_FAMILY_STATUS']).codes
    new_data['NAME_HOUSING_TYPE']          = pd.Categorical(new_data['NAME_HOUSING_TYPE']).codes
    new_data['OCCUPATION_TYPE']            = pd.Categorical(new_data['OCCUPATION_TYPE']).codes
    new_data['WEEKDAY_APPR_PROCESS_START'] = pd.Categorical(new_data['WEEKDAY_APPR_PROCESS_START']).codes
    new_data['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)
    new_data['ORGANIZATION_TYPE']          = pd.Categorical(new_data['ORGANIZATION_TYPE']).codes
    new_data['FONDKAPREMONT_MODE']         = pd.Categorical(new_data['FONDKAPREMONT_MODE']).codes
    new_data['HOUSETYPE_MODE']             = pd.Categorical(new_data['HOUSETYPE_MODE']).codes
#     new_data['WALLSMATERIAL_MODE']         = pd.Categorical(new_data['WALLSMATERIAL_MODE']).codes
#     new_data['EMERGENCYSTATE_MODE']        = (new_data['EMERGENCYSTATE_MODE'] == 'Yes').astype(int)
    new_data.drop(columns=['WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'], inplace=True)
    return new_data

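# Binary helpers on bureau.csv: whether a credit is still open and whether its end date lies in the future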
def category_engineer_bureau(df):
    X = df.copy()
    X['CREDIT_ACTIVE_BINARY'] = (X['CREDIT_ACTIVE'] != 'Closed').astype(int)
    X['CREDIT_ENDDATE_NORMAL'] = (X['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
    # drop() returns a new frame, so the result has to be assigned back
    X = X.drop(columns=['CREDIT_ACTIVE', 'DAYS_CREDIT_ENDDATE'])
    return X

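# Ratio features on the application table, plus row-wise statistics over the three external credit scores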
def custom_features(df):
    X = df.copy()
    X['ANNUITY_TO_INCOME_RATIO'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
    X['INCOME_TO_CREDIT_RATIO'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
    X['CAR_TO_BIRTH_RATIO'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
    X['CAR_TO_EMPLOY_RATIO'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']
    X['CHILDREN_TO_FAMILY_MEMBERS_RATIO'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
    X['CREDIT_TO_ANNUITY_RATIO'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
    X['CREDIT_TO_GOODS_RATIO'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
    X['CREDIT_TO_INCOME_RATIO'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
    X['DAYS_EMPLOYED_TO_BIRTH_RATIO'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
    X['INCOME_PER_CHILD'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
    X['INCOME_PER_FAM_MEMBER'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
    X['PAYMENTS_RATE'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
    X['EXTERNAL_SOURCES_TOTAL'] = 2 * X['EXT_SOURCE_1'] + 3 * X['EXT_SOURCE_2'] + 4 * X['EXT_SOURCE_3']
    # getattr() looks up the numpy function by name without resorting to eval()
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
        X[f'EXT_SOURCES_{function_name}'] = getattr(np, function_name)(
            X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
    return X

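# Holdout evaluation: fit LightGBM on 70% of the rows and print ROC AUC on the remaining 30%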
def Build_And_Test_Model(data):
    model = LGBMClassifier(n_estimators=340, max_depth=4, num_leaves=30, objective='binary')

    X = data.drop(columns=['TARGET'])
    y = data['TARGET']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

    model.fit(X_train, y_train)

    print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

    return model, (X_train, y_train), (X_test, y_test)

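# A plain decision tree is used only to rank features; score() prints its accuracy as a sanity check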
def Feature_Importance(X_train, y_train, X_test, y_test):
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    # DecisionTreeClassifier.score() returns accuracy, not R^2
    print("Accuracy on the train set:")
    print(clf.score(X_train, y_train))

    print("Accuracy on the test set:")
    print(clf.score(X_test, y_test))

    feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns.values)
    return feature_importances

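# Imputation strategy: zero-fill the external scores, drop numeric columns that are more than 35%
# missing (except OWN_CAR_AGE), and fill the rest with the mean (or the mode when it is integer-valued)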
def fill_nans(data, logs=False):
    result = data.copy()
    result.dropna(subset=['SK_ID_CURR'], inplace=True)

    not_to_delete = ['OWN_CAR_AGE']
    result['EXT_SOURCE_1'].fillna(0.0, inplace=True)
    result['EXT_SOURCE_2'].fillna(0.0, inplace=True)
    result['EXT_SOURCE_3'].fillna(0.0, inplace=True)

    to_fill_nan = result.columns[result.isna().any()].tolist()

    for feature in to_fill_nan:
        count_of_nans = result[feature].isna().sum()
        if result.dtypes[feature] == 'object' or result.dtypes[feature] == 'category':
            continue
        if feature not in not_to_delete and count_of_nans > int(0.35 * len(result[feature])):
            result.drop(columns=[feature], inplace=True)
            continue
        mean = result[feature].mean()
        mode = result[feature].mode()[0]
        # an integer-valued mode suggests a discrete feature, so impute with the mode instead
        if int(mode) == mode:
            mean = mode
        if logs:
            print(f'feature name: {feature}; mean: {mean}; mode: {mode};')
        if np.isfinite(mean):
            result[feature].fillna(mean, inplace=True)
        elif np.isfinite(mode):
            result[feature].fillna(mode, inplace=True)
        else:
            result[feature].fillna(0, inplace=True)
    return result

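# One-hot encode the remaining object/category columns, sanitizing category names so the
# resulting column names are safe for LightGBM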
def Encode(data, logs=False):
    df = data.copy()
    encoder = OneHotEncoder(sparse_output=False)
    categorical_features = df.select_dtypes(include=['object', 'category'])

    if logs:
        print(f"Your categorical features: {categorical_features.columns.tolist()}")

    for feature in categorical_features:
        encoded_feature = encoder.fit_transform(df[feature].values.reshape(-1, 1))

        feature_categories = encoder.categories_[0]
        new_columns = [feature + '_' + str(category)
                                           .replace(' ', '_')
                                           .replace(':', '')
                                           .replace('/', '_or_')
                                           .replace(',', '')
                                           .replace('-', '_') for category in feature_categories]

        # reuse df's index so concat aligns rows even after earlier dropna() calls
        encoded_df = pd.DataFrame(encoded_feature, columns=new_columns, index=df.index)
        if logs:
            print(encoded_df.info())

        df = pd.concat([df, encoded_df], axis=1)

    df.drop(categorical_features.columns, axis=1, inplace=True)
    return df

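# Left-join every per-client aggregate table onto the raw training applications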
def merge_data():
    result = train_raw.copy()
    result.replace([np.inf, -np.inf], np.nan, inplace=True)
    for grouped in [ bureau_grouped
                   , credit_card_grouped
                   , installments_grouped
                   , pos_cash_grouped
                   , previous_application_grouped
                   ]:
        result = result.merge(grouped, on='SK_ID_CURR', how='left', validate='one_to_one')
    return result

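# bureau.csv aggregates: one row per client with loan counts, activity shares and debt totals,
# plus derived debt/credit ratios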
bureau_groupby = category_engineer_bureau(bureau).groupby(by=['SK_ID_CURR'])

bureau_aggregations = [ ('DAYS_CREDIT',            'count',   'COUNT_OF_PAST_LOANS')
                      , ('CREDIT_TYPE',            'nunique', 'COUNT_OF_LOAN_TYPES')
                      , ('CREDIT_ACTIVE_BINARY',   'mean',    'AVERAGE_CREDIT_ACTIVE')
                      , ('CREDIT_ENDDATE_NORMAL',  'mean',    'AVERAGE_CREDIT_ENDDATE')
                      , ('AMT_CREDIT_SUM_DEBT',    'sum',     'TOTAL_DEBT')
                      , ('AMT_CREDIT_SUM',         'sum',     'TOTAL_CREDIT')
                      , ('AMT_CREDIT_SUM_OVERDUE', 'sum',     'TOTAL_OVERDUE')
                      , ('CNT_CREDIT_PROLONG',     'sum',     'TOTAL_PROLONGATION')
                      ]

bureau_grouped = SK_ID.copy()
for column, agg_fun, new_name in bureau_aggregations:
    group = bureau_groupby[column].agg(agg_fun).reset_index().rename(columns={column: new_name})
    bureau_grouped = bureau_grouped.merge(group, on=['SK_ID_CURR'], how='left')

bureau_grouped['AVERAGE_COUNT_LOANS'] = bureau_grouped['COUNT_OF_PAST_LOANS'] / bureau_grouped['COUNT_OF_LOAN_TYPES']
bureau_grouped['DEBT_CREDIT_RATIO'] = bureau_grouped['TOTAL_DEBT'] / bureau_grouped['TOTAL_CREDIT']
bureau_grouped['OVERDUE_DEBT_RATIO'] = bureau_grouped['TOTAL_OVERDUE'] / bureau_grouped['TOTAL_DEBT']

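# credit_card_balance.csv aggregates: per-loan installment counts and peak credit-line utilisation,
# rolled up to one row per client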
# per-loan helpers: one row per previous loan, so the per-client sums/means below are not
# distorted by the monthly balance rows (the original assignment back onto credit_card_raw
# misaligned indices)
cnt_instalments = ( credit_card_raw.groupby(['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM']
                    .max()
                    .reset_index()
                    .rename(columns={'CNT_INSTALMENT_MATURE_CUM': 'CNT_INSTALMENTS'}) )
max_loading = ( credit_card_raw.groupby(['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL'])
                .apply(lambda x: x['AMT_BALANCE'].max() / x['AMT_CREDIT_LIMIT_ACTUAL'].max())
                .reset_index(name='MAX_LOADING') )

credit_card_groupby = credit_card_raw.groupby(by=['SK_ID_CURR'])

group_SK_ID_PREV = credit_card_groupby['SK_ID_PREV'].agg('nunique').reset_index()
group_CNT_INSTALMENTS = cnt_instalments.groupby('SK_ID_CURR')['CNT_INSTALMENTS'].sum().reset_index()
group_MAX_LOADING = max_loading.groupby('SK_ID_CURR')['MAX_LOADING'].agg('mean').reset_index()
group_SK_DPD = credit_card_groupby['SK_DPD'].agg('mean').reset_index()
group_AMT_DRAWINGS_ATM_CURRENT = credit_card_groupby['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()
group_AMT_DRAWINGS_CURRENT = credit_card_groupby['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()

group_SK_ID_PREV.rename(index=str, columns={'SK_ID_PREV': 'COUNT_OF_LOANS'}, inplace=True)
group_CNT_INSTALMENTS.rename(index=str, columns={'CNT_INSTALMENTS': 'TOTAL_INSTALMENTS'}, inplace=True)
group_MAX_LOADING.rename(index=str, columns={'MAX_LOADING': 'AVG_LOADING'}, inplace=True)
group_SK_DPD.rename(index=str, columns={'SK_DPD': 'AVG_DPD'}, inplace=True)
group_AMT_DRAWINGS_ATM_CURRENT.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'TOTAL_DRAWINGS_ATM'}, inplace=True)
group_AMT_DRAWINGS_CURRENT.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'TOTAL_DRAWINGS'}, inplace=True)

credit_card_grouped = pd.DataFrame({'SK_ID_CURR': credit_card_raw['SK_ID_CURR'].unique()})

credit_card_grouped = credit_card_grouped.merge(group_SK_ID_PREV, on=['SK_ID_CURR'], how='left')
credit_card_grouped = credit_card_grouped.merge(group_CNT_INSTALMENTS, on=['SK_ID_CURR'], how='left')
credit_card_grouped = credit_card_grouped.merge(group_MAX_LOADING, on=['SK_ID_CURR'], how='left')
credit_card_grouped = credit_card_grouped.merge(group_SK_DPD, on=['SK_ID_CURR'], how='left')
credit_card_grouped = credit_card_grouped.merge(group_AMT_DRAWINGS_ATM_CURRENT, on=['SK_ID_CURR'], how='left')
credit_card_grouped = credit_card_grouped.merge(group_AMT_DRAWINGS_CURRENT, on=['SK_ID_CURR'], how='left')

credit_card_grouped['INSTALLMENTS_PER_LOAN'] = credit_card_grouped['TOTAL_INSTALMENTS'] / credit_card_grouped['COUNT_OF_LOANS']
credit_card_grouped['CASH_RATIO'] = credit_card_grouped['TOTAL_DRAWINGS_ATM'] / credit_card_grouped['TOTAL_DRAWINGS']

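# previous_application.csv aggregates: mean/min/max/var of the main numeric columns per client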
previous_application_raw = pd.read_csv('/kaggle/input/full-homecredit/previous_application.csv')
agg_col_names = []
feature_names = [ 'AMT_ANNUITY'
                , 'AMT_APPLICATION'
                , 'AMT_CREDIT'
                , 'AMT_DOWN_PAYMENT'
                , 'AMT_GOODS_PRICE'
#                 , 'RATE_DOWN_PAYMENT'       # too many missing values
#                 , 'RATE_INTEREST_PRIMARY'   # too many missing values
                , 'DAYS_DECISION'
                , 'SELLERPLACE_AREA'
                , 'CNT_PAYMENT'
                ]

for agg_fun in ['mean', 'min', 'max', 'var']:
    for column in feature_names:
        agg_col_names.append((column, agg_fun))

previous_application_grouped = pd.DataFrame({'SK_ID_CURR': previous_application_raw['SK_ID_CURR'].unique()})
previous_application_groupby = previous_application_raw[['SK_ID_CURR'] + feature_names].groupby('SK_ID_CURR')
print('CHECKPOINT')
for column_name, agg in agg_col_names:
    print(f'CHECKPOINT-{column_name}-{agg}')
    new_col_name = '{}_{}'.format(column_name, agg)
    cur_group = previous_application_groupby.agg(agg).reset_index().rename( index=str
                                                                          , columns={column_name: new_col_name})[['SK_ID_CURR', new_col_name]]
    # merge() returns a new frame, so the result has to be assigned back
    previous_application_grouped = previous_application_grouped.merge(cur_group, on=['SK_ID_CURR'], how='left')

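# installments_payments.csv aggregates: min/max/sum/mean of payment amounts and dates per client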
agg_col_names = []

for agg_fun in ['min', 'max', 'sum', 'mean']:
    for column in [ 'AMT_INSTALMENT'
                  , 'AMT_PAYMENT'
                  , 'DAYS_ENTRY_PAYMENT'
                  , 'DAYS_INSTALMENT'
                  , 'NUM_INSTALMENT_NUMBER'
                  , 'NUM_INSTALMENT_VERSION'
                  ]:
        agg_col_names.append((column, agg_fun))

installments_grouped = pd.DataFrame({'SK_ID_CURR': installments_raw['SK_ID_CURR'].unique()})
installments_groupby = installments_raw.groupby('SK_ID_CURR')
for column_name, agg in agg_col_names:
    new_col_name = '{}_{}'.format(column_name, agg)
    cur_group = installments_groupby.agg(agg).reset_index().rename( index=str
                                                                  , columns={column_name: new_col_name})[['SK_ID_CURR', new_col_name]]
    # merge() returns a new frame, so the result has to be assigned back
    installments_grouped = installments_grouped.merge(cur_group, on=['SK_ID_CURR'], how='left')

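# POS_CASH_balance.csv aggregates: mean/min/max/var of the balance history and days-past-due per client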
agg_col_names = []
feature_names = [ 'MONTHS_BALANCE'
                , 'CNT_INSTALMENT'
                , 'CNT_INSTALMENT_FUTURE'
                , 'SK_DPD'
                , 'SK_DPD_DEF'
                ]

for agg_fun in ['mean', 'min', 'max', 'var']:
    for column in feature_names:
        agg_col_names.append((column, agg_fun))

pos_cash_grouped = pd.DataFrame({'SK_ID_CURR': pos_cash_raw['SK_ID_CURR'].unique()})
pos_cash_groupby = pos_cash_raw[['SK_ID_CURR'] + feature_names].groupby('SK_ID_CURR')
print('CHECKPOINT')
for column_name, agg in agg_col_names:
    print(f'CHECKPOINT-{column_name}-{agg}')
    new_col_name = '{}_{}'.format(column_name, agg)
    cur_group = pos_cash_groupby.agg(agg).reset_index().rename( index=str
                                                              , columns={column_name: new_col_name})[['SK_ID_CURR', new_col_name]]
    # merge() returns a new frame, so the result has to be assigned back
    pos_cash_grouped = pos_cash_grouped.merge(cur_group, on=['SK_ID_CURR'], how='left')

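# Full pipeline: merge everything, impute, encode, add custom features, train a baseline,
# then retrain on the features the decision tree ranked above zero importance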
train_valid = merge_data()

# train_valid = object_feature_engineering(train_valid)
train_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
train_valid = fill_nans(train_valid)
train_valid = Encode(train_valid, logs=False)

train_valid = custom_features(train_valid)
train_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
train_valid = fill_nans(train_valid)

model_valid, (X_train, y_train), (X_test, y_test) = Build_And_Test_Model(train_valid)
feature_importance = Feature_Importance(X_train, y_train, X_test, y_test)
feature_importance = feature_importance.sort_values(ascending=False)
test_features = feature_importance[feature_importance > 0].index.tolist()
train_valid_with_feature_importance = train_valid[test_features + ['TARGET']]
_, (X_train, y_train), (X_test, y_test) = Build_And_Test_Model(train_valid_with_feature_importance)

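# 5-fold stratified cross-validation of the final feature set; the best fold's model
# is then re-scored on the earlier holdout split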
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_valid_with_feature_importance.drop(columns=['TARGET'])
y = train_valid_with_feature_importance['TARGET']

model = LGBMClassifier(n_estimators=340, max_depth=4, num_leaves=30, objective='binary')
scores = cross_validate(model, X, y, cv=cv, scoring='roc_auc', return_estimator=True)

mean_score = scores['test_score'].mean()
std_score = scores['test_score'].std()
print(f'ROC_AUC : {mean_score} +/- {std_score}')
best_estimator = scores['estimator'][scores['test_score'].argmax()]
print(roc_auc_score(y_test, best_estimator.predict_proba(X_test)[:, 1]))