# Requires: base64, datetime, gzip, json, os, time, torch, plus the project
# helpers referenced below (CustomDataset, filtered_collate_fn,
# get_random_string_with_timestamp, S3_utils, boto3_s3_client, s3_bucket_name).
dataset = CustomDataset(items)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False,
                                          num_workers=NUM_PREPROCESS_WORKERS, drop_last=False,
                                          collate_fn=filtered_collate_fn)
if len(data_loader) == 0:
    print('Nothing to segment in {}'.format(items))
    return
# upper bound on the item count; the last batch may be smaller since drop_last=False
curr_items = BATCH_SIZE * len(data_loader)
print('---> processing dataloader with {} batches of BATCH SIZE {} each at {}'.format(
    len(data_loader), BATCH_SIZE,
    datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
st = time.time()
cids, np_masks, meta = [], [], []
# note: each of *_batch is a tuple
for batch_idx, (id_batch, img_batch, tag_batch) in enumerate(data_loader):
    try:
        mask_batch = self.model.forward_pass(img_batch)
        cids = [*cids, *list(id_batch)]
        np_masks = [*np_masks, *list(mask_batch)]
        meta = [*meta, *list(tag_batch)]
    except Exception as e:
        # skip a batch that fails inference instead of aborting the whole run
        print('--> skipping batch {} due to error: {}'.format(batch_idx, e))
print('--> finished batch processing dataloader with {} batches of BATCH SIZE {} each at {}, '
      'Overall time taken : {:.4f} seconds'.format(
          len(data_loader), BATCH_SIZE,
          datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
          time.time() - st))
checkpoint = time.time()
filename_pattern = 'chunk-{}.json'.format(get_random_string_with_timestamp())
items = []
for cid, feature, tag in zip(cids, np_masks, meta):
    json_obj = {
        "cid": str(cid),
        # compress a high-dimensional vector; feature must be bytes-like (e.g. a contiguous numpy array)
        "feature": base64.b64encode(gzip.compress(feature)).decode(),
        "tag": str(tag)
    }
    items.append(json.dumps(json_obj))
file_data = '\n'.join(items)
filename = os.path.join(s3_bucket_name, filename_pattern)
S3_utils.write_data_to_bucket(boto3_s3_client, filename, file_data)
print('===> saving inference data to {}, Count {}, overall time taken {:.4f}'.format(
    filename, len(cids), time.time() - checkpoint))
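# Read-side sketch (not part of the original paste): each line of the saved chunk
# stores the feature as base64(gzip(raw bytes)), so a consumer could recover it
# roughly as below. decode_chunk_line is a hypothetical helper; the dtype (and any
# reshape) are assumptions that must match whatever self.model.forward_pass() emits.
import base64
import gzip
import json

import numpy as np

def decode_chunk_line(line, dtype=np.float32):
    record = json.loads(line)
    raw = gzip.decompress(base64.b64decode(record["feature"]))
    feature = np.frombuffer(raw, dtype=dtype)  # flat vector; reshape if needed
    return record["cid"], feature, record["tag"]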