Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import aiofiles
- import aiohttp
- import asyncio
- import async_timeout
- import os
- import numpy as np
- from PIL import Image
- import io
- import json
- from skimage.transform import resize
- import tarfile
def make_tarfile(output_filename, source_dir):
    """Pack ``source_dir`` into a gzip-compressed tar archive.

    Args:
        output_filename: Path of the ``.tar.gz`` file to create.
        source_dir: File or directory to add; directories are added
            recursively. A trailing path separator is allowed.

    The archive's top-level entry is named after the last path component
    of ``source_dir``.
    """
    # basename('some/dir/') is '' — normalise first so a trailing separator
    # does not store the directory under an empty member name.
    top_level_name = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=top_level_name)
async def download(session, url, idx, output_folder):
    """Stream the resource at ``url`` into ``<output_folder>/<idx>.jpeg``.

    Args:
        session: Client session whose ``get`` is used for the request
            (an ``aiohttp.ClientSession`` in this file).
        url: Address of the image to fetch.
        idx: Index of the image; used as the output file's base name.
        output_folder: Directory the file is written into, with or
            without a trailing separator.

    Returns:
        None.
    """
    target_path = os.path.join(output_folder, str(idx) + '.jpeg')
    # Last path component, used only for the progress message. normpath
    # makes this work whether or not the folder ends with a separator.
    folder_label = os.path.basename(os.path.normpath(output_folder))

    # NOTE(review): the HTTP status is not checked, so an error page would
    # be saved as a .jpeg — confirm whether failures should be skipped.
    async with session.get(url) as response:
        async with aiofiles.open(target_path, 'wb') as fd:
            # Copy in 1 KiB chunks so a whole image is never held in memory.
            async for chunk in response.content.iter_chunked(1024):
                await fd.write(chunk)
    # Leaving the ``async with`` above releases the response, so no explicit
    # ``response.release()`` is needed.
    print("Image: ", idx, folder_label)
async def bound_download(sem, session, url, idx, output_folder):
    """Run ``download`` while holding one slot of the semaphore ``sem``.

    The slot is acquired before the download starts and is always given
    back afterwards, including when the download raises.
    """
    await sem.acquire()
    try:
        await download(session, url, idx, output_folder)
    finally:
        sem.release()
async def load_data(output_folder, output_filename, json_path):
    """Download every image listed in a JSON file, then archive the folder.

    Args:
        output_folder: Directory the downloaded images are written into.
        output_filename: Path of the ``.tar.gz`` archive created once all
            downloads have finished.
        json_path: JSON file with an ``images`` list whose items each
            carry a ``url``.
    """
    with open(json_path, 'r') as f:  # json file containing urls to train and validation
        documents = json.load(f)
    images = documents.get('images')

    # At most 20 downloads hold the semaphore at any one time.
    sem = asyncio.Semaphore(20)

    # No ``loop`` argument: the session binds to the running event loop on
    # its own, so this function no longer needs a module-level ``loop``
    # global (which only exists when the file is run as a script).
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.ensure_future(
                bound_download(sem, session, img.get('url'), idx, output_folder)
            )
            for idx, img in enumerate(images)
        ]
        await asyncio.gather(*tasks)

    # Compression
    make_tarfile(output_filename, output_folder)
async def main(loop):
    """Download and archive the train, validation and test image sets.

    The ``DATASET`` environment variable is used as a raw string prefix:
    URL lists are read from ``<DATASET><split>.json``, images are written
    to ``<DATASET>dataset/<split>/`` and each folder is archived to
    ``<DATASET>dataset/<split>.tar.gz``.

    Args:
        loop: Accepted for compatibility with existing callers; not used
            inside this function.

    Raises:
        RuntimeError: If ``DATASET`` is not set.
    """
    dataset = os.environ.get('DATASET')
    if dataset is None:
        # Fail early with a clear message instead of a ``None + str``
        # TypeError on the first path concatenation below.
        raise RuntimeError("The DATASET environment variable must be set")

    sub_folder = 'dataset/'
    splits = ('train', 'validation', 'test')

    # Create all output folders up front. exist_ok avoids the race between
    # checking for a directory and creating it.
    for split in splits:
        os.makedirs(dataset + sub_folder + split + '/', exist_ok=True)

    # Process the splits one after another, in the order listed above.
    for split in splits:
        await load_data(
            dataset + sub_folder + split + '/',
            dataset + sub_folder + split + '.tar.gz',
            dataset + split + '.json',
        )
if __name__ == '__main__':
    # Create and install the event loop explicitly: calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    # ``loop`` is deliberately kept as a module-level name so that code
    # which looks it up globally keeps working.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(loop))
    finally:
        # Release the loop's resources even if main() raised.
        loop.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement