Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- !wget --no-check-certificate 'https://www.dropbox.com/s/5iuef7c9ljj84t6/train_transaction.csv?dl=0' -O train_transaction.csv
- !wget --no-check-certificate 'https://www.dropbox.com/s/cmy01z5fw7ohlmd/train_identity.csv?dl=0' -O train_identity.csv
- !wget --no-check-certificate 'https://www.dropbox.com/s/7thqkuxnwsa7njj/test_transaction.csv?dl=0' -O test_transaction.csv
- !wget --no-check-certificate 'https://www.dropbox.com/s/b40nvbb9e2usd5w/test_identity.csv?dl=0' -O test_identity.csv
- !wget --no-check-certificate 'https://www.dropbox.com/s/arkyoz0bel8z4d2/sample_submission.csv?dl=0' -O sample_submission.csv
- !pip install catboost -q
def _smallest_int_dtype(lowest, highest):
    """Return the narrowest NumPy integer dtype that can hold [lowest, highest].

    Unsigned dtypes are tried when ``lowest`` is non-negative, signed dtypes
    otherwise. Returns ``None`` when the bounds are missing (e.g. the column
    is empty) or no 64-bit integer dtype covers the range.
    """
    if pd.isna(lowest) or pd.isna(highest):
        return None
    # Compare as Python ints so the check cannot overflow a fixed-width scalar.
    lowest, highest = int(lowest), int(highest)
    if lowest >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        if limits.min <= lowest and highest <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    * Integer columns are cast to the narrowest signed/unsigned integer dtype
      that holds their observed range.
    * Every other numeric column is cast to ``float32``. If such a column
      contains non-finite values, its NaNs are first replaced with
      ``column minimum - 1`` and the column name is recorded. Infinite values
      are left as they are.
    * Non-numeric columns (strings, datetimes, categoricals, ...) are left
      untouched.

    Note that narrowing to ``float32`` reduces precision for values that need
    more than 24 significant bits.

    Args:
        df: ``pandas.DataFrame`` to shrink; it is modified in place.

    Returns:
        ``(df, na_columns)`` where ``df`` is the same frame that was passed in
        and ``na_columns`` lists the columns found to contain non-finite
        values, in column order.
    """
    # The progress bar is cosmetic: use the module-level ``tqdm`` when the
    # file provides one, otherwise iterate silently.
    try:
        progress = tqdm
    except NameError:
        progress = iter

    na_columns = []  # Columns that had missing values filled in.
    for col in progress(df.columns):
        series = df[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        if pd.api.types.is_integer_dtype(series):
            target = _smallest_int_dtype(series.min(), series.max())
            if target is not None:
                df[col] = series.astype(target)
            continue

        if not np.isfinite(series).all():
            na_columns.append(col)
            # Assign the result back instead of a chained ``inplace`` fill,
            # which acts on a temporary under pandas Copy-on-Write.
            series = series.fillna(series.min() - 1)
        df[col] = series.astype(np.float32)

    return df, na_columns
# Directory that holds the downloaded competition CSVs.
INPUT_DIR = '.'

# Read all five CSVs up front, in the same order as before.
(
    train_transaction,
    train_identity,
    test_transaction,
    test_identity,
    sample_submission,
) = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
        'sample_submission.csv',
    )
)

# Left-join identity rows onto the transactions by TransactionID, drop the
# raw frames so their memory can be reclaimed, then shrink the merged frame.
df_train = pd.merge(train_transaction, train_identity, how='left', on='TransactionID')
del train_transaction, train_identity
df_train, df_train_NAlist = reduce_mem_usage(df_train)

# Same pipeline for the test split.
df_test = pd.merge(test_transaction, test_identity, how='left', on='TransactionID')
del test_transaction, test_identity
df_test, df_test_NAlist = reduce_mem_usage(df_test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement