Combine multiple netcdf files

#!/usr/bin/env python

import numpy as np
import pandas as pd
import xarray as xr
from netCDF4 import Dataset
import sys


def combine_boutdata(datadir='.', prefix='/BOUT.dmp.', inpfile='./BOUT.inp', output_file_prefix='boutdata',
                     save_fields_separately=False, ignore_fields=None, only_save_fields=None,
                     save_grid=True, save_dtype='float32'):

    # TODO allow for reading of multiple different runs in different directories, and return as a list of datasets
    # TODO save information about the directory the data was loaded from

    # Read BOUT.inp file and get nested dictionary of options
    opts = read_options(inpfile)

    # Read first BOUT dump file (e.g. BOUT.dmp.0.nc) to get metadata and info on data variables to save
    first_dump_file = datadir + prefix + '0.nc'
    with xr.open_dataset(first_dump_file) as ds:

        # Read metadata and group variables by type
        metadata, evolving_vars, spatial_vars = read_metadata(ds)

        # Only need to collect non-scalar variables which we care about
        excluded_vars = list(metadata.keys()) + ignore_fields
        if not save_grid:
            grid_variables = ['dx', 'dy', 'J',
                              'g11',  'g22',  'g33',  'g12',  'g13',  'g23',
                              'g_11', 'g_22', 'g_33', 'g_12', 'g_13', 'g_23']
            excluded_vars.extend(grid_variables)

        # Anything which is time-evolving or spatially-dependent should be collected and combined from all dump files
        data_vars_to_save = list((set(evolving_vars) | set(spatial_vars)) - set(excluded_vars))
        print('Will save the fields:\n    ' + str(data_vars_to_save))

        dim_sizes, var_dims = get_dimensions(ds, metadata, data_vars_to_save)

    if not save_fields_separately:
        print('Will save all fields...')
        # Combine data from all dump files into one, also saving the data from the input file and the metadata
        savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
                       to_collect=data_vars_to_save, output_file_prefix=output_file_prefix, save_dtype=save_dtype)

    else:
        # Establish which fields need to be saved to separate files
        if only_save_fields is not None:
            if set(only_save_fields).issubset(evolving_vars):
                fields_to_save_separately = set(only_save_fields).intersection(set(spatial_vars)) - set(excluded_vars)
            else:
                raise ValueError('The options only_save_fields is for specifying which of the time-evolving variables '
                                 'you wish to save')
        else:
            fields_to_save_separately = set(evolving_vars).intersection(set(spatial_vars)) - set(excluded_vars)
        print('Will save each of the fields ' + str(fields_to_save_separately) + ' in a separate file')

        for field in fields_to_save_separately:
            print('Saving field ' + field + '...')
            to_save = (set(data_vars_to_save) - fields_to_save_separately).union({field})

            single_output_file_prefix = output_file_prefix + '_' + field
            savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
                           to_collect=list(to_save), output_file_prefix=single_output_file_prefix,
                           save_dtype=save_dtype)


def read_options(input_file):
    """
    Uses class written by Ben (from BOUT-dev/tools/pylib/boutdata/data.py) to read in tree of all options
    specified in BOUT.inp file, and return them as a nested dictionary.
    """

    from boutdata import data
    import pprint

    print('Reading the BOUT.inp options file:')
    opts = data.BoutOptionsFile(input_file)

    print('Read in options:\n')
    opts_dict = opts.as_dict()
    pprint.pprint(opts_dict)

    return opts_dict


def read_metadata(ds):
    """
    Reads the file BOUT.dmp.0.nc and returns the t_array as a numpy array, a list of the variables corresponding to
    multidimensional data, and all remaining metadata as an array
    """

    # TODO work out what to do with data which depends on t only but is not t_array

    # Determine which arrays represent what kind of data by examining their dimensions
    variables = list(ds.variables)
    metadata_vars, evolving_vars, spatial_vars = sort_variables_by_dimensions(variables, ds.data_vars)

    # Save metadata as a dictionary so that it can be stored as the attributes of the final DataSet
    # Assumes all the metadata are numpy scalars
    # (which they should be because they have been filtered for t,x,y,z dependence)
    metadata = scalar_vars_to_dict(ds[metadata_vars], metadata_vars)
    print('Found metadata:\n    ' + str(metadata))

    return metadata, evolving_vars, spatial_vars


def sort_variables_by_dimensions(variables, data):
    """
    Determine which arrays represent what kind of data by examining their dimensions
    """

    # All variables must either be scalars (metadata), have t dependence (evolving), or x,y or z dependence (spatial)
    # Variables can and will appear in multiple lists
    evolving_vars = [var for var in variables if 't' in data[var].dims]
    spatial_vars = [var for var in variables if {'x','y','z'}.intersection(data[var].dims)]
    metadata_vars = list(set(variables) - set(evolving_vars) - set(spatial_vars))

    return metadata_vars, evolving_vars, spatial_vars


def scalar_vars_to_dict(ds, vars):
    """
    Converts all remaining metadata read from the first dump file into a dictionary.
    """

    metadata_keys = list(ds.variables)
    try:
        metadata_values = [np.asscalar(ds[var].values) for var in vars]
    except ValueError('Remaining metadata is assumed to be a scalar value'):
        sys.exit(1)
    metadata = dict(zip(metadata_keys, metadata_values))

    return metadata


def get_dimensions(ds, metadata, vars_to_collect):
    """
    Returns a dictionary of the overall sizes of each dimension of the dataset, and also a dictionary of the
    dimensions possessed by each variable in the dataset.
    """

    # TODO find out if the assumptions made in this function about guard cells are generally true or not

    dims = ds.dims
    nx = metadata['nx']  # BOUT already includes x guard cells
    ny = metadata['ny'] + 2*metadata['MYG']  # assumes guard cells on both ends of y domain
    nz = dims['z']  # BOUT doesn't parallelize in z
    dim_sizes = {'t': dims['t'], 'x': nx, 'y': ny, 'z': nz}
    print('Overall simulation domain has dimensions:\n    ' + str(dim_sizes))

    # Need to know the dimensions of each variable individually in order to allocate space in the output NetCDF file
    vars_dims = {var: ds[var].dims for var in vars_to_collect}

    return dim_sizes, vars_dims


def savedata_to_nc(dim_sizes, var_dims, metadata, options, to_collect,
                   output_file_prefix='boutdata', save_dtype='float32'):
    """
    Collects data from BOUT.dmp.*.nc files in the current directory and saves as a single NetCDF file.

    Also stores the given metadata and options in the attributes dictionary of the NetCDF DataSet.
    """

    attributes = combine_metadata(options, metadata)

    # Read data from BOUT.dmp.*.nc files
    files_to_load_prefix = 'BOUT.dmp.'

    collect_parallelised_data(files_to_load_prefix, output_file_prefix,
                              to_collect, dim_sizes, var_dims, attributes,
                              nxpe=metadata['NXPE'], mxg=metadata['MXG'], mxsub=metadata['MXSUB'],
                              nype=metadata['NYPE'], myg=metadata['MYG'], mysub=metadata['MYSUB'],
                              save_dtype=save_dtype)


def collect_parallelised_data(files_to_load_prefix, output_file_prefix,
                              vars_to_collect, dim_sizes, vars_dims,  attributes,
                              nxpe, nype, mxsub, mysub, mxg, myg,
                              save_dtype='float32'):

    # Create output file with full dimensions to fill up later
    output_file = initialise_output_file(output_file_prefix, vars_to_collect, dim_sizes, vars_dims, save_dtype)

    # Collect all data that was split up by parallelisation
    combine_dump_data(files_to_load_prefix, output_file,
                       vars_to_collect, vars_dims,
                       nxpe, nype, mxsub, mysub, mxg, myg)

    # Add attributes dictionary
    print('\nSaving input options and metadata into attributes dictionary of output file...')
    output_file.setncatts(attributes)

    # Close output file
    print('Finished - closing output file...')
    output_file.close()


def initialise_output_file(output_file_prefix, variables_to_collect, dim_sizes, variables_dims, save_dtype='float32'):

    print('Creating new NetCDF file to store output in...')
    output = Dataset(output_file_prefix + '.nc', "w", format="NETCDF4")

    print('Allocating memory for variables in output NetCDF file...')
    # dims_sizes should be of whole dataset, e.g. {'t': 14, 'x': 196, 'y': 68, 'z': 128}
    for dim, size in dim_sizes.items():
        output.createDimension(dim, size)

    # Create variables to write to in target NetCDF file
    if save_dtype == 'float32':
        dtype = 'f4'
    elif save_dtype == 'float64':
        dtype = 'f8'
    else:
        dtype = save_dtype
    for variable in variables_to_collect:
        output.createVariable(variable, dtype, variables_dims[variable])

    return output


def combine_metadata(options, metadata):
    """
    Flatten the options dictionary and combine it with the metadata dictionary, to return a combined attributes dict.
    """

    opts= {'input': options}

    meta = {'meta': metadata}
    attributes = {**meta, **opts}

    # Use a Pandas function to flatten the dictionary
    df = pd.io.json.json_normalize(attributes)
    flat_attrs = df.to_dict(orient='records')[0]

    # Replace '_' separators with ':' to mimic the BOUT options file structure
    flat_attrs_colons = dict(zip([x.replace('.', ':') for x in flat_attrs.keys()], flat_attrs.values()))

    # NetCDF can't save an attributes dictionary where any of the keys contain slash characters, so remove them
    # TODO decide on the best way to deal with keys containing slashes in general
    slashless_attributes = {key.replace('/', ''): val for key, val in flat_attrs_colons.items()}

    return slashless_attributes


def combine_dump_data(files_to_load_prefix, output_file_handle,
                      vars_to_collect, vars_dims,
                      nxpe, nype, mxsub, mysub, mxg, myg):
    """
    Given a handle to a correctly set-up output NetCDF file, will collect all distributed variables from BOUT dmp files,
    and write them into the output file.

    Only opens one dmp.*.nc file at a time to minimise RAM usage
    """

    # Loop over files to load from
    for j in range(nype):
        for i in range(nxpe):

            # Open one dump file
            file_num = (i + nxpe * j)
            dmp_file = files_to_load_prefix + str(file_num) + '.nc'
            print('Reading file ' + str(dmp_file) + '...', end='\r')
            dmp_ds = Dataset(dmp_file, "r", format="NETCDF4")

            # Forget about variables which aren't spread across multiple files
            for variable in vars_to_collect:
                var_dims = vars_dims[variable]

                dump_data = dmp_ds.variables[variable][:]

                # Remove ghost cells
                trimmed = remove_ghost(dump_data, i, j, mxg=mxg, myg=myg, nxpe=nxpe, nype=nype, dims=var_dims)

                # Insert chunk into output file
                insertion_loc = where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims)
                output_file_handle.variables[variable][insertion_loc] = trimmed

            # Close dump file
            dmp_ds.close()


def remove_ghost(fat, i, j, mxg, myg, nxpe, nype, dims):
    """
    Removes all ghost cells from the currently loaded dump file, but leaves the guard cells alone.
    """

    if i == 0:
        xslice = slice(mxg, None, None)
    elif i == nxpe - 1:
        xslice = slice(0, -mxg, None)
    else:
        xslice = slice(mxg, -mxg, None)

    if j == 0:
        yslice = slice(myg, None, None)
    elif j == nype - 1:
        yslice = slice(0, -myg, None)
    else:
        yslice = slice(myg, -myg, None)

    selection = dim_dependent_slice(dims=dims, xslice=xslice, yslice=yslice)
    trimmed = fat[selection]

    return trimmed


def dim_dependent_slice(dims, tslice=slice(None,None,None), xslice=slice(None,None,None),
                              yslice=slice(None,None,None), zslice=slice(None,None,None)):
    """
    Returns a multidimensional slice which has the same dimensions as the dims argument.
    """
    if dims == []:
        TypeError('There should not be scalar variables in this part of the code')

    selection = []
    if 't' in dims:
        selection.append(tslice)
    if 'x' in dims:
        selection.append(xslice)
    if 'y' in dims:
        selection.append(yslice)
    if 'z' in dims:
        selection.append(zslice)

    return selection


def where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims):
    """
    Decides where in the output NetCDF file the data from the current dump file should be placed.
    """

    # Account for guard cells along the x dimension
    if i == 0:
        lower_x = mxsub * i
        upper_x = mxsub * (i + 1) + mxg
    elif i == nxpe - 1:
        lower_x = mxsub * i + mxg
        upper_x = mxsub * (i + 1) + 2 * mxg
    else:
        lower_x = mxsub * i + mxg
        upper_x = mxsub * (i + 1) + mxg
    xinsertion = slice(lower_x, upper_x, None)

    # Account for guard cells along the y dimension
    if j == 0:
        lower_y = mysub * j
        upper_y = mysub * (j + 1) + myg
    elif j == nype - 1:
        lower_y = mysub * j + myg
        upper_y = mysub * (j + 1) + 2 * myg
    else:
        lower_y = mysub * j + myg
        upper_y = mysub * (j + 1) + myg
    yinsertion = slice(lower_y, upper_y, None)

    insertion_loc = dim_dependent_slice(var_dims, xslice=xinsertion, yslice=yinsertion)

    return insertion_loc