Advertisement
Guest User

Untitled

a guest
Apr 22nd, 2018
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.92 KB | None | 0 0
  1. import aiofiles
  2. import aiohttp
  3. import asyncio
  4. import async_timeout
  5. import os
  6. import numpy as np
  7. from PIL import Image
  8. import io
  9. import json
  10. from skimage.transform import resize
  11. import tarfile
  12.  
  13.  
  14. def make_tarfile(output_filename, source_dir):
  15.     with tarfile.open(output_filename, "w:gz") as tar:
  16.         tar.add(source_dir, arcname=os.path.basename(source_dir))
  17.  
  18. async def download(session, url, idx, output_folder):
  19.  
  20.     async with session.get(url) as response:
  21.         filename = os.path.basename(url)
  22.         async with aiofiles.open( output_folder + str(idx) + '.jpeg', 'wb') as fd:
  23.             while True:
  24.                 chunk = await response.content.read(1024)
  25.                 if not chunk:
  26.                     break
  27.                 await fd.write(chunk)
  28.         print("Image: ", idx, output_folder.split('/')[-2])
  29.         return await response.release()
  30.  
  31.  
  32. async def bound_download(sem, session, url, idx, output_folder):
  33.  
  34.     async with sem:
  35.         await download(session, url, idx, output_folder)
  36.  
  37. async def load_data(output_folder, output_filename, json_path):
  38.  
  39.    # create instance of Semaphore
  40.     sem = asyncio.Semaphore(20)
  41.     tasks = []
  42.  
  43.     with open(json_path, 'r') as f: # json file containing urls to train and validation
  44.         documents = json.load(f)
  45.  
  46.     images = documents.get('images')
  47.  
  48.     async with aiohttp.ClientSession(loop=loop) as session:
  49.         for idx, img in enumerate(images):
  50.             task = asyncio.ensure_future(bound_download(sem, session, img.get('url'), idx, output_folder))
  51.             tasks.append(task)
  52.  
  53.         responses = asyncio.gather(*tasks)
  54.         await responses
  55.  
  56.     # Compression
  57.     make_tarfile(output_filename, output_folder)
  58.  
  59. async def main(loop):
  60.  
  61.     dataset = os.environ.get('DATASET')
  62.     sub_folder = 'dataset/'
  63.  
  64.     train_output_folder =  dataset + sub_folder + 'train/'
  65.     validation_output_folder = dataset + sub_folder + 'validation/'
  66.     test_output_folder = dataset + sub_folder + 'test/'
  67.  
  68.     train_output_file = dataset + sub_folder + 'train.tar.gz'
  69.     validation_output_file = dataset + sub_folder + 'validation.tar.gz'
  70.     test_output_file = dataset + sub_folder + 'test.tar.gz'
  71.  
  72.     json_train = dataset + 'train.json'
  73.     json_validation = dataset + 'validation.json'
  74.     json_test = dataset + 'test.json'
  75.  
  76.     if not os.path.exists(train_output_folder):
  77.         os.makedirs(train_output_folder)
  78.  
  79.     if not os.path.exists(validation_output_folder):
  80.         os.makedirs(validation_output_folder)
  81.  
  82.     if not os.path.exists(test_output_folder):
  83.         os.makedirs(test_output_folder)
  84.  
  85.     await load_data(train_output_folder, train_output_file, json_train)
  86.     await load_data(validation_output_folder, validation_output_file, json_validation)
  87.     await load_data(test_output_folder, test_output_file, json_test)
  88.  
  89. if __name__ == '__main__':
  90.     loop = asyncio.get_event_loop()
  91.     loop.run_until_complete(main(loop))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement