Guest User

Combine multiple netcdf files

a guest
Aug 3rd, 2018
639
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 14.24 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import numpy as np
  4. import pandas as pd
  5. import xarray as xr
  6. from netCDF4 import Dataset
  7. import sys
  8.  
  9.  
  10. def combine_boutdata(datadir='.', prefix='/BOUT.dmp.', inpfile='./BOUT.inp', output_file_prefix='boutdata',
  11.                      save_fields_separately=False, ignore_fields=None, only_save_fields=None,
  12.                      save_grid=True, save_dtype='float32'):
  13.  
  14.     # TODO allow for reading of multiple different runs in different directories, and return as a list of datasets
  15.     # TODO save information about the directory the data was loaded from
  16.  
  17.     # Read BOUT.inp file and get nested dictionary of options
  18.     opts = read_options(inpfile)
  19.  
  20.     # Read first BOUT dump file (e.g. BOUT.dmp.0.nc) to get metadata and info on data variables to save
  21.     first_dump_file = datadir + prefix + '0.nc'
  22.     with xr.open_dataset(first_dump_file) as ds:
  23.  
  24.         # Read metadata and group variables by type
  25.         metadata, evolving_vars, spatial_vars = read_metadata(ds)
  26.  
  27.         # Only need to collect non-scalar variables which we care about
  28.         excluded_vars = list(metadata.keys()) + ignore_fields
  29.         if not save_grid:
  30.             grid_variables = ['dx', 'dy', 'J',
  31.                               'g11',  'g22',  'g33',  'g12',  'g13',  'g23',
  32.                               'g_11', 'g_22', 'g_33', 'g_12', 'g_13', 'g_23']
  33.             excluded_vars.extend(grid_variables)
  34.  
  35.         # Anything which is time-evolving or spatially-dependent should be collected and combined from all dump files
  36.         data_vars_to_save = list((set(evolving_vars) | set(spatial_vars)) - set(excluded_vars))
  37.         print('Will save the fields:\n    ' + str(data_vars_to_save))
  38.  
  39.         dim_sizes, var_dims = get_dimensions(ds, metadata, data_vars_to_save)
  40.  
  41.     if not save_fields_separately:
  42.         print('Will save all fields...')
  43.         # Combine data from all dump files into one, also saving the data from the input file and the metadata
  44.         savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
  45.                        to_collect=data_vars_to_save, output_file_prefix=output_file_prefix, save_dtype=save_dtype)
  46.  
  47.     else:
  48.         # Establish which fields need to be saved to separate files
  49.         if only_save_fields is not None:
  50.             if set(only_save_fields).issubset(evolving_vars):
  51.                 fields_to_save_separately = set(only_save_fields).intersection(set(spatial_vars)) - set(excluded_vars)
  52.             else:
  53.                 raise ValueError('The options only_save_fields is for specifying which of the time-evolving variables '
  54.                                  'you wish to save')
  55.         else:
  56.             fields_to_save_separately = set(evolving_vars).intersection(set(spatial_vars)) - set(excluded_vars)
  57.         print('Will save each of the fields ' + str(fields_to_save_separately) + ' in a separate file')
  58.  
  59.         for field in fields_to_save_separately:
  60.             print('Saving field ' + field + '...')
  61.             to_save = (set(data_vars_to_save) - fields_to_save_separately).union({field})
  62.            
  63.             single_output_file_prefix = output_file_prefix + '_' + field
  64.             savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
  65.                            to_collect=list(to_save), output_file_prefix=single_output_file_prefix,
  66.                            save_dtype=save_dtype)
  67.  
  68.  
  69. def read_options(input_file):
  70.     """
  71.    Uses class written by Ben (from BOUT-dev/tools/pylib/boutdata/data.py) to read in tree of all options
  72.    specified in BOUT.inp file, and return them as a nested dictionary.
  73.    """
  74.  
  75.     from boutdata import data
  76.     import pprint
  77.  
  78.     print('Reading the BOUT.inp options file:')
  79.     opts = data.BoutOptionsFile(input_file)
  80.  
  81.     print('Read in options:\n')
  82.     opts_dict = opts.as_dict()
  83.     pprint.pprint(opts_dict)
  84.  
  85.     return opts_dict
  86.  
  87.  
  88. def read_metadata(ds):
  89.     """
  90.    Reads the file BOUT.dmp.0.nc and returns the t_array as a numpy array, a list of the variables corresponding to
  91.    multidimensional data, and all remaining metadata as an array
  92.    """
  93.  
  94.     # TODO work out what to do with data which depends on t only but is not t_array
  95.  
  96.     # Determine which arrays represent what kind of data by examining their dimensions
  97.     variables = list(ds.variables)
  98.     metadata_vars, evolving_vars, spatial_vars = sort_variables_by_dimensions(variables, ds.data_vars)
  99.  
  100.     # Save metadata as a dictionary so that it can be stored as the attributes of the final DataSet
  101.     # Assumes all the metadata are numpy scalars
  102.     # (which they should be because they have been filtered for t,x,y,z dependence)
  103.     metadata = scalar_vars_to_dict(ds[metadata_vars], metadata_vars)
  104.     print('Found metadata:\n    ' + str(metadata))
  105.  
  106.     return metadata, evolving_vars, spatial_vars
  107.  
  108.  
  109. def sort_variables_by_dimensions(variables, data):
  110.     """
  111.    Determine which arrays represent what kind of data by examining their dimensions
  112.    """
  113.  
  114.     # All variables must either be scalars (metadata), have t dependence (evolving), or x,y or z dependence (spatial)
  115.     # Variables can and will appear in multiple lists
  116.     evolving_vars = [var for var in variables if 't' in data[var].dims]
  117.     spatial_vars = [var for var in variables if {'x','y','z'}.intersection(data[var].dims)]
  118.     metadata_vars = list(set(variables) - set(evolving_vars) - set(spatial_vars))
  119.  
  120.     return metadata_vars, evolving_vars, spatial_vars
  121.  
  122.  
  123. def scalar_vars_to_dict(ds, vars):
  124.     """
  125.    Converts all remaining metadata read from the first dump file into a dictionary.
  126.    """
  127.  
  128.     metadata_keys = list(ds.variables)
  129.     try:
  130.         metadata_values = [np.asscalar(ds[var].values) for var in vars]
  131.     except ValueError('Remaining metadata is assumed to be a scalar value'):
  132.         sys.exit(1)
  133.     metadata = dict(zip(metadata_keys, metadata_values))
  134.  
  135.     return metadata
  136.  
  137.  
  138. def get_dimensions(ds, metadata, vars_to_collect):
  139.     """
  140.    Returns a dictionary of the overall sizes of each dimension of the dataset, and also a dictionary of the
  141.    dimensions possessed by each variable in the dataset.
  142.    """
  143.  
  144.     # TODO find out if the assumptions made in this function about guard cells are generally true or not
  145.  
  146.     dims = ds.dims
  147.     nx = metadata['nx']  # BOUT already includes x guard cells
  148.     ny = metadata['ny'] + 2*metadata['MYG']  # assumes guard cells on both ends of y domain
  149.     nz = dims['z']  # BOUT doesn't parallelize in z
  150.     dim_sizes = {'t': dims['t'], 'x': nx, 'y': ny, 'z': nz}
  151.     print('Overall simulation domain has dimensions:\n    ' + str(dim_sizes))
  152.  
  153.     # Need to know the dimensions of each variable individually in order to allocate space in the output NetCDF file
  154.     vars_dims = {var: ds[var].dims for var in vars_to_collect}
  155.  
  156.     return dim_sizes, vars_dims
  157.  
  158.  
  159. def savedata_to_nc(dim_sizes, var_dims, metadata, options, to_collect,
  160.                    output_file_prefix='boutdata', save_dtype='float32'):
  161.     """
  162.    Collects data from BOUT.dmp.*.nc files in the current directory and saves as a single NetCDF file.
  163.  
  164.    Also stores the given metadata and options in the attributes dictionary of the NetCDF DataSet.
  165.    """
  166.  
  167.     attributes = combine_metadata(options, metadata)
  168.  
  169.     # Read data from BOUT.dmp.*.nc files
  170.     files_to_load_prefix = 'BOUT.dmp.'
  171.  
  172.     collect_parallelised_data(files_to_load_prefix, output_file_prefix,
  173.                               to_collect, dim_sizes, var_dims, attributes,
  174.                               nxpe=metadata['NXPE'], mxg=metadata['MXG'], mxsub=metadata['MXSUB'],
  175.                               nype=metadata['NYPE'], myg=metadata['MYG'], mysub=metadata['MYSUB'],
  176.                               save_dtype=save_dtype)
  177.  
  178.  
  179. def collect_parallelised_data(files_to_load_prefix, output_file_prefix,
  180.                               vars_to_collect, dim_sizes, vars_dims,  attributes,
  181.                               nxpe, nype, mxsub, mysub, mxg, myg,
  182.                               save_dtype='float32'):
  183.  
  184.     # Create output file with full dimensions to fill up later
  185.     output_file = initialise_output_file(output_file_prefix, vars_to_collect, dim_sizes, vars_dims, save_dtype)
  186.  
  187.     # Collect all data that was split up by parallelisation
  188.     combine_dump_data(files_to_load_prefix, output_file,
  189.                        vars_to_collect, vars_dims,
  190.                        nxpe, nype, mxsub, mysub, mxg, myg)
  191.  
  192.     # Add attributes dictionary
  193.     print('\nSaving input options and metadata into attributes dictionary of output file...')
  194.     output_file.setncatts(attributes)
  195.  
  196.     # Close output file
  197.     print('Finished - closing output file...')
  198.     output_file.close()
  199.  
  200.  
  201. def initialise_output_file(output_file_prefix, variables_to_collect, dim_sizes, variables_dims, save_dtype='float32'):
  202.  
  203.     print('Creating new NetCDF file to store output in...')
  204.     output = Dataset(output_file_prefix + '.nc', "w", format="NETCDF4")
  205.  
  206.     print('Allocating memory for variables in output NetCDF file...')
  207.     # dims_sizes should be of whole dataset, e.g. {'t': 14, 'x': 196, 'y': 68, 'z': 128}
  208.     for dim, size in dim_sizes.items():
  209.         output.createDimension(dim, size)
  210.  
  211.     # Create variables to write to in target NetCDF file
  212.     if save_dtype == 'float32':
  213.         dtype = 'f4'
  214.     elif save_dtype == 'float64':
  215.         dtype = 'f8'
  216.     else:
  217.         dtype = save_dtype
  218.     for variable in variables_to_collect:
  219.         output.createVariable(variable, dtype, variables_dims[variable])
  220.  
  221.     return output
  222.  
  223.  
  224. def combine_metadata(options, metadata):
  225.     """
  226.    Flatten the options dictionary and combine it with the metadata dictionary, to return a combined attributes dict.
  227.    """
  228.  
  229.     opts= {'input': options}
  230.  
  231.     meta = {'meta': metadata}
  232.     attributes = {**meta, **opts}
  233.  
  234.     # Use a Pandas function to flatten the dictionary
  235.     df = pd.io.json.json_normalize(attributes)
  236.     flat_attrs = df.to_dict(orient='records')[0]
  237.  
  238.     # Replace '_' separators with ':' to mimic the BOUT options file structure
  239.     flat_attrs_colons = dict(zip([x.replace('.', ':') for x in flat_attrs.keys()], flat_attrs.values()))
  240.  
  241.     # NetCDF can't save an attributes dictionary where any of the keys contain slash characters, so remove them
  242.     # TODO decide on the best way to deal with keys containing slashes in general
  243.     slashless_attributes = {key.replace('/', ''): val for key, val in flat_attrs_colons.items()}
  244.  
  245.     return slashless_attributes
  246.  
  247.  
  248. def combine_dump_data(files_to_load_prefix, output_file_handle,
  249.                       vars_to_collect, vars_dims,
  250.                       nxpe, nype, mxsub, mysub, mxg, myg):
  251.     """
  252.    Given a handle to a correctly set-up output NetCDF file, will collect all distributed variables from BOUT dmp files,
  253.    and write them into the output file.
  254.  
  255.    Only opens one dmp.*.nc file at a time to minimise RAM usage
  256.    """
  257.  
  258.     # Loop over files to load from
  259.     for j in range(nype):
  260.         for i in range(nxpe):
  261.  
  262.             # Open one dump file
  263.             file_num = (i + nxpe * j)
  264.             dmp_file = files_to_load_prefix + str(file_num) + '.nc'
  265.             print('Reading file ' + str(dmp_file) + '...', end='\r')
  266.             dmp_ds = Dataset(dmp_file, "r", format="NETCDF4")
  267.  
  268.             # Forget about variables which aren't spread across multiple files
  269.             for variable in vars_to_collect:
  270.                 var_dims = vars_dims[variable]
  271.  
  272.                 dump_data = dmp_ds.variables[variable][:]
  273.  
  274.                 # Remove ghost cells
  275.                 trimmed = remove_ghost(dump_data, i, j, mxg=mxg, myg=myg, nxpe=nxpe, nype=nype, dims=var_dims)
  276.  
  277.                 # Insert chunk into output file
  278.                 insertion_loc = where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims)
  279.                 output_file_handle.variables[variable][insertion_loc] = trimmed
  280.  
  281.             # Close dump file
  282.             dmp_ds.close()
  283.  
  284.  
  285. def remove_ghost(fat, i, j, mxg, myg, nxpe, nype, dims):
  286.     """
  287.    Removes all ghost cells from the currently loaded dump file, but leaves the guard cells alone.
  288.    """
  289.  
  290.     if i == 0:
  291.         xslice = slice(mxg, None, None)
  292.     elif i == nxpe - 1:
  293.         xslice = slice(0, -mxg, None)
  294.     else:
  295.         xslice = slice(mxg, -mxg, None)
  296.  
  297.     if j == 0:
  298.         yslice = slice(myg, None, None)
  299.     elif j == nype - 1:
  300.         yslice = slice(0, -myg, None)
  301.     else:
  302.         yslice = slice(myg, -myg, None)
  303.  
  304.     selection = dim_dependent_slice(dims=dims, xslice=xslice, yslice=yslice)
  305.     trimmed = fat[selection]
  306.  
  307.     return trimmed
  308.  
  309.  
  310. def dim_dependent_slice(dims, tslice=slice(None,None,None), xslice=slice(None,None,None),
  311.                               yslice=slice(None,None,None), zslice=slice(None,None,None)):
  312.     """
  313.    Returns a multidimensional slice which has the same dimensions as the dims argument.
  314.    """
  315.     if dims == []:
  316.         TypeError('There should not be scalar variables in this part of the code')
  317.  
  318.     selection = []
  319.     if 't' in dims:
  320.         selection.append(tslice)
  321.     if 'x' in dims:
  322.         selection.append(xslice)
  323.     if 'y' in dims:
  324.         selection.append(yslice)
  325.     if 'z' in dims:
  326.         selection.append(zslice)
  327.  
  328.     return selection
  329.  
  330.  
  331. def where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims):
  332.     """
  333.    Decides where in the output NetCDF file the data from the current dump file should be placed.
  334.    """
  335.  
  336.     # Account for guard cells along the x dimension
  337.     if i == 0:
  338.         lower_x = mxsub * i
  339.         upper_x = mxsub * (i + 1) + mxg
  340.     elif i == nxpe - 1:
  341.         lower_x = mxsub * i + mxg
  342.         upper_x = mxsub * (i + 1) + 2 * mxg
  343.     else:
  344.         lower_x = mxsub * i + mxg
  345.         upper_x = mxsub * (i + 1) + mxg
  346.     xinsertion = slice(lower_x, upper_x, None)
  347.  
  348.     # Account for guard cells along the y dimension
  349.     if j == 0:
  350.         lower_y = mysub * j
  351.         upper_y = mysub * (j + 1) + myg
  352.     elif j == nype - 1:
  353.         lower_y = mysub * j + myg
  354.         upper_y = mysub * (j + 1) + 2 * myg
  355.     else:
  356.         lower_y = mysub * j + myg
  357.         upper_y = mysub * (j + 1) + myg
  358.     yinsertion = slice(lower_y, upper_y, None)
  359.  
  360.     insertion_loc = dim_dependent_slice(var_dims, xslice=xinsertion, yslice=yinsertion)
  361.  
  362.     return insertion_loc
Advertisement
Add Comment
Please, Sign In to add comment