from dask.distributed import Client
import pandas as pd
import xarray as xr
import s3fs

if __name__ == '__main__':
    # Use however many processors are available on the local machine
    client = Client()
    print(client)

    # Input: list of hourly NetCDF file names
    root = '/projects/water/nwm/data/forcing_short_range/'
    dates = pd.date_range(start='2018-03-02T00:00', end='2018-04-11T00:00', freq='H')
    uris = ['{}{}/nwm.t{}z.short_range.forcing.f001.conus.nc'.format(root, a.strftime('%Y%m%d'), a.strftime('%H')) for a in dates]

    # Output: S3 bucket path for the Zarr store
    f_zarr = 'rsignell/nwm/test04'

    # Read data using xarray for lazy evaluation and parallel execution
    ds = xr.open_mfdataset(uris, concat_dim='time')

    # Drop these problematic, unimportant variables
    ds = ds.drop(['ProjectionCoordinateSystem', 'time_bounds'])

    # Write data using xarray.to_zarr
    fs = s3fs.S3FileSystem(anon=False)
    d = s3fs.S3Map(f_zarr, s3=fs)
    ds.to_zarr(store=d, mode='w')
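
To verify the result, here is a minimal read-back sketch (an assumption, not part of the original script: it assumes the same 'rsignell/nwm/test04' store path and AWS credentials with read access):

import s3fs
import xarray as xr

# Open the Zarr store written above as a single lazy dataset.
# Hypothetical check: assumes read access to the same bucket path.
fs = s3fs.S3FileSystem(anon=False)
store = s3fs.S3Map('rsignell/nwm/test04', s3=fs)
ds = xr.open_zarr(store)
print(ds)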