from dask.distributed import Client
import pandas as pd
import xarray as xr
import s3fs

if __name__ == '__main__':
    # Use however many processors are available on the local machine
    client = Client()
    print(client)

    # Input: list of hourly NetCDF file names
    root = '/projects/water/nwm/data/forcing_short_range/'
    dates = pd.date_range(start='2018-03-02T00:00', end='2018-04-11T00:00', freq='H')
    uris = ['{}{}/nwm.t{}z.short_range.forcing.f001.conus.nc'.format(root, a.strftime('%Y%m%d'), a.strftime('%H')) for a in dates]

    # Output: S3 bucket path for the Zarr store
    f_zarr = 'rsignell/nwm/test04'

    # Read data using xarray for lazy evaluation and parallel execution
    ds = xr.open_mfdataset(uris, concat_dim='time')

    # Drop these problematic, unimportant variables
    ds = ds.drop(['ProjectionCoordinateSystem', 'time_bounds'])

    # Write data using xarray.to_zarr
    fs = s3fs.S3FileSystem(anon=False)
    d = s3fs.S3Map(f_zarr, s3=fs)
    ds.to_zarr(store=d, mode='w')
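
To verify the result, here is a minimal read-back sketch (an assumption, not part of the original script: it assumes the same 'rsignell/nwm/test04' store path and AWS credentials with read access):

import s3fs
import xarray as xr

# Open the Zarr store written above as a single lazy dataset.
# Hypothetical check: assumes read access to the same bucket path.
fs = s3fs.S3FileSystem(anon=False)
store = s3fs.S3Map('rsignell/nwm/test04', s3=fs)
ds = xr.open_zarr(store)
print(ds)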