Advertisement
chaarse

Untitled

Mar 8th, 2025
194
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.60 KB | None | 0 0
  1. !wget --no-check-certificate 'https://www.dropbox.com/s/5iuef7c9ljj84t6/train_transaction.csv?dl=0' -O train_transaction.csv
  2. !wget --no-check-certificate 'https://www.dropbox.com/s/cmy01z5fw7ohlmd/train_identity.csv?dl=0' -O train_identity.csv
  3. !wget --no-check-certificate 'https://www.dropbox.com/s/7thqkuxnwsa7njj/test_transaction.csv?dl=0' -O test_transaction.csv
  4. !wget --no-check-certificate 'https://www.dropbox.com/s/b40nvbb9e2usd5w/test_identity.csv?dl=0' -O test_identity.csv
  5. !wget --no-check-certificate 'https://www.dropbox.com/s/arkyoz0bel8z4d2/sample_submission.csv?dl=0' -O sample_submission.csv
  6.  
  7. !pip install catboost -q
  8.  
  9. def reduce_mem_usage(df):
  10.     NAlist = [] # Keeps track of columns that have missing values filled in.
  11.     for col in tqdm(df.columns):
  12.         if df[col].dtype != object:  # Exclude strings
  13.  
  14.             # make variables for Int, max and min
  15.             IsInt = False
  16.             col_max_value = df[col].max()
  17.             col_min_value = df[col].min()
  18.  
  19.             # Integer does not support NA, therefore, NA needs to be filled
  20.             if not np.isfinite(df[col]).all():
  21.                 NAlist.append(col)
  22.                 df[col].fillna(col_min_value - 1, inplace=True)
  23.  
  24.             # test if column can be converted to an integer
  25.             col_as_int = df[col].fillna(0).astype(np.int64)
  26.             diff = (df[col] - col_as_int)
  27.             diff = diff.sum()
  28.             if np.abs(diff) < 0.01:
  29.                 IsInt = True
  30.  
  31.             # Make Integer/unsigned Integer datatypes
  32.             if pd.api.types.is_integer_dtype(df[col]):
  33.                 if col_min_value >= 0:
  34.                     if col_max_value < 255:
  35.                         df[col] = df[col].astype(np.uint8)
  36.                     elif col_max_value < 65535:
  37.                         df[col] = df[col].astype(np.uint16)
  38.                     elif col_max_value < 4294967295:
  39.                         df[col] = df[col].astype(np.uint32)
  40.                     else:
  41.                         df[col] = df[col].astype(np.uint64)
  42.                 else:
  43.                     if col_min_value > np.iinfo(np.int8).min and col_max_value < np.iinfo(np.int8).max:
  44.                         df[col] = df[col].astype(np.int8)
  45.                     elif col_min_value > np.iinfo(np.int16).min and col_max_value < np.iinfo(np.int16).max:
  46.                         df[col] = df[col].astype(np.int16)
  47.                     elif col_min_value > np.iinfo(np.int32).min and col_max_value < np.iinfo(np.int32).max:
  48.                         df[col] = df[col].astype(np.int32)
  49.                     elif col_min_value > np.iinfo(np.int64).min and col_max_value < np.iinfo(np.int64).max:
  50.                         df[col] = df[col].astype(np.int64)    
  51.  
  52.             # Make float datatypes 32 bit
  53.             else:
  54.                 df[col] = df[col].astype(np.float32)
  55.  
  56.     return df, NAlist
  57.  
  58. INPUT_DIR = '.'
  59.  
  60. train_transaction = pd.read_csv(os.path.join(INPUT_DIR, 'train_transaction.csv'))
  61. train_identity = pd.read_csv(os.path.join(INPUT_DIR, 'train_identity.csv'))
  62. test_transaction = pd.read_csv(os.path.join(INPUT_DIR, 'test_transaction.csv'))
  63. test_identity = pd.read_csv(os.path.join(INPUT_DIR, 'test_identity.csv'))
  64. sample_submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
  65.  
  66. df_train = train_transaction.merge(train_identity, how='left', on='TransactionID')
  67. del train_transaction, train_identity
  68. df_train, df_train_NAlist = reduce_mem_usage(df_train)
  69.  
  70. df_test = test_transaction.merge(test_identity, how='left', on='TransactionID')
  71. del test_transaction, test_identity
  72. df_test, df_test_NAlist = reduce_mem_usage(df_test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement