#!/usr/bin/env python
import numpy as np
import pandas as pd
import xarray as xr
from netCDF4 import Dataset
import sys
def combine_boutdata(datadir='.', prefix='/BOUT.dmp.', inpfile='./BOUT.inp', output_file_prefix='boutdata',
                     save_fields_separately=False, ignore_fields=None, only_save_fields=None,
                     save_grid=True, save_dtype='float32'):
    # TODO allow for reading of multiple different runs in different directories, and return as a list of datasets
    # TODO save information about the directory the data was loaded from

    if ignore_fields is None:
        ignore_fields = []

    # Read BOUT.inp file and get nested dictionary of options
    opts = read_options(inpfile)

    # Read first BOUT dump file (e.g. BOUT.dmp.0.nc) to get metadata and info on data variables to save
    first_dump_file = datadir + prefix + '0.nc'
    with xr.open_dataset(first_dump_file) as ds:
        # Read metadata and group variables by type
        metadata, evolving_vars, spatial_vars = read_metadata(ds)

        # Only need to collect non-scalar variables which we care about
        excluded_vars = list(metadata.keys()) + ignore_fields
        if not save_grid:
            grid_variables = ['dx', 'dy', 'J',
                              'g11', 'g22', 'g33', 'g12', 'g13', 'g23',
                              'g_11', 'g_22', 'g_33', 'g_12', 'g_13', 'g_23']
            excluded_vars.extend(grid_variables)

        # Anything which is time-evolving or spatially-dependent should be collected and combined from all dump files
        data_vars_to_save = list((set(evolving_vars) | set(spatial_vars)) - set(excluded_vars))
        print('Will save the fields:\n ' + str(data_vars_to_save))

        dim_sizes, var_dims = get_dimensions(ds, metadata, data_vars_to_save)

    if not save_fields_separately:
        print('Will save all fields...')
        # Combine data from all dump files into one, also saving the data from the input file and the metadata
        savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
                       to_collect=data_vars_to_save, output_file_prefix=output_file_prefix, save_dtype=save_dtype)
    else:
        # Establish which fields need to be saved to separate files
        if only_save_fields is not None:
            if set(only_save_fields).issubset(evolving_vars):
                fields_to_save_separately = set(only_save_fields).intersection(set(spatial_vars)) - set(excluded_vars)
            else:
                raise ValueError('The option only_save_fields is for specifying which of the time-evolving '
                                 'variables you wish to save')
        else:
            fields_to_save_separately = set(evolving_vars).intersection(set(spatial_vars)) - set(excluded_vars)
        print('Will save each of the fields ' + str(fields_to_save_separately) + ' in a separate file')

        for field in fields_to_save_separately:
            print('Saving field ' + field + '...')
            to_save = (set(data_vars_to_save) - fields_to_save_separately).union({field})
            single_output_file_prefix = output_file_prefix + '_' + field
            savedata_to_nc(dim_sizes=dim_sizes, var_dims=var_dims, metadata=metadata, options=opts,
                           to_collect=list(to_save), output_file_prefix=single_output_file_prefix,
                           save_dtype=save_dtype)
def read_options(input_file):
    """
    Uses the class written by Ben (from BOUT-dev/tools/pylib/boutdata/data.py) to read in the tree of all
    options specified in the BOUT.inp file, and returns them as a nested dictionary.
    """
    from boutdata import data
    import pprint

    print('Reading the BOUT.inp options file:')
    opts = data.BoutOptionsFile(input_file)
    print('Read in options:\n')
    opts_dict = opts.as_dict()
    pprint.pprint(opts_dict)
    return opts_dict
def read_metadata(ds):
    """
    Reads the first dump file (BOUT.dmp.0.nc) and returns a dictionary of the scalar metadata, a list of the
    time-evolving variables, and a list of the spatially-dependent variables.
    """
    # TODO work out what to do with data which depends on t only but is not t_array

    # Determine which arrays represent what kind of data by examining their dimensions
    variables = list(ds.variables)
    metadata_vars, evolving_vars, spatial_vars = sort_variables_by_dimensions(variables, ds.data_vars)

    # Save metadata as a dictionary so that it can be stored as the attributes of the final Dataset
    # Assumes all the metadata are numpy scalars
    # (which they should be because they have been filtered for t,x,y,z dependence)
    metadata = scalar_vars_to_dict(ds[metadata_vars], metadata_vars)
    print('Found metadata:\n ' + str(metadata))
    return metadata, evolving_vars, spatial_vars
def sort_variables_by_dimensions(variables, data):
    """
    Determine which arrays represent what kind of data by examining their dimensions.
    """
    # All variables must either be scalars (metadata), have t dependence (evolving), or x, y or z dependence (spatial)
    # Variables can and will appear in multiple lists
    evolving_vars = [var for var in variables if 't' in data[var].dims]
    spatial_vars = [var for var in variables if {'x', 'y', 'z'}.intersection(data[var].dims)]
    metadata_vars = list(set(variables) - set(evolving_vars) - set(spatial_vars))
    return metadata_vars, evolving_vars, spatial_vars
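# A sketch of the expected classification (variable names and dims assumed, not from a real run):
#   'n'       with dims ('t', 'x', 'y', 'z')  ->  evolving and spatial
#   't_array' with dims ('t',)                ->  evolving only
#   'dx'      with dims ('x', 'y')            ->  spatial only
#   'nx'      with dims ()                    ->  metadata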
def scalar_vars_to_dict(ds, vars):
    """
    Converts all remaining metadata read from the first dump file into a dictionary.
    """
    try:
        # .item() extracts each value as a native Python scalar (np.asscalar is deprecated)
        metadata = {var: ds[var].values.item() for var in vars}
    except ValueError:
        print('Remaining metadata is assumed to be scalar-valued')
        sys.exit(1)
    return metadata
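# A sketch of the expected result (values assumed): for vars = ['nx', 'MXG'] this might
# return {'nx': 68, 'MXG': 2}, with every value a native Python scalar.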
def get_dimensions(ds, metadata, vars_to_collect):
    """
    Returns a dictionary of the overall sizes of each dimension of the dataset, and also a dictionary of the
    dimensions possessed by each variable in the dataset.
    """
    # TODO find out if the assumptions made in this function about guard cells are generally true or not
    dims = ds.dims
    nx = metadata['nx']                        # BOUT already includes x guard cells
    ny = metadata['ny'] + 2 * metadata['MYG']  # assumes guard cells on both ends of the y domain
    nz = dims['z']                             # BOUT doesn't parallelize in z
    dim_sizes = {'t': dims['t'], 'x': nx, 'y': ny, 'z': nz}
    print('Overall simulation domain has dimensions:\n ' + str(dim_sizes))

    # Need to know the dimensions of each variable individually in order to allocate space in the output NetCDF file
    vars_dims = {var: ds[var].dims for var in vars_to_collect}
    return dim_sizes, vars_dims
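# Worked example with assumed metadata: if the dump files report nx = 68 (x guard cells
# already included), ny = 64 and MYG = 2, the combined y size is 64 + 2*2 = 68, so
# dim_sizes could be {'t': 14, 'x': 68, 'y': 68, 'z': 128}.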
def savedata_to_nc(dim_sizes, var_dims, metadata, options, to_collect,
                   output_file_prefix='boutdata', save_dtype='float32'):
    """
    Collects data from BOUT.dmp.*.nc files in the current directory and saves as a single NetCDF file.
    Also stores the given metadata and options in the attributes dictionary of the NetCDF Dataset.
    """
    attributes = combine_metadata(options, metadata)

    # Read data from BOUT.dmp.*.nc files (assumed to be in the current directory)
    files_to_load_prefix = 'BOUT.dmp.'
    collect_parallelised_data(files_to_load_prefix, output_file_prefix,
                              to_collect, dim_sizes, var_dims, attributes,
                              nxpe=metadata['NXPE'], mxg=metadata['MXG'], mxsub=metadata['MXSUB'],
                              nype=metadata['NYPE'], myg=metadata['MYG'], mysub=metadata['MYSUB'],
                              save_dtype=save_dtype)
def collect_parallelised_data(files_to_load_prefix, output_file_prefix,
                              vars_to_collect, dim_sizes, vars_dims, attributes,
                              nxpe, nype, mxsub, mysub, mxg, myg,
                              save_dtype='float32'):
    # Create output file with full dimensions to fill up later
    output_file = initialise_output_file(output_file_prefix, vars_to_collect, dim_sizes, vars_dims, save_dtype)

    # Collect all data that was split up by parallelisation
    combine_dump_data(files_to_load_prefix, output_file,
                      vars_to_collect, vars_dims,
                      nxpe, nype, mxsub, mysub, mxg, myg)

    # Add attributes dictionary
    print('\nSaving input options and metadata into attributes dictionary of output file...')
    output_file.setncatts(attributes)

    # Close output file
    print('Finished - closing output file...')
    output_file.close()
def initialise_output_file(output_file_prefix, variables_to_collect, dim_sizes, variables_dims, save_dtype='float32'):
    print('Creating new NetCDF file to store output in...')
    output = Dataset(output_file_prefix + '.nc', "w", format="NETCDF4")

    print('Allocating memory for variables in output NetCDF file...')
    # dim_sizes should cover the whole dataset, e.g. {'t': 14, 'x': 196, 'y': 68, 'z': 128}
    for dim, size in dim_sizes.items():
        output.createDimension(dim, size)

    # Create variables to write to in the target NetCDF file
    if save_dtype == 'float32':
        dtype = 'f4'
    elif save_dtype == 'float64':
        dtype = 'f8'
    else:
        dtype = save_dtype
    for variable in variables_to_collect:
        output.createVariable(variable, dtype, variables_dims[variable])
    return output
def combine_metadata(options, metadata):
    """
    Flatten the options dictionary and combine it with the metadata dictionary, to return a combined attributes dict.
    """
    opts = {'input': options}
    meta = {'meta': metadata}
    attributes = {**meta, **opts}

    # Use a Pandas function to flatten the nested dictionary
    df = pd.json_normalize(attributes)
    flat_attrs = df.to_dict(orient='records')[0]

    # Replace the '.' separators with ':' to mimic the BOUT options file structure
    flat_attrs_colons = dict(zip([x.replace('.', ':') for x in flat_attrs.keys()], flat_attrs.values()))

    # NetCDF can't save an attributes dictionary where any of the keys contain slash characters, so remove them
    # TODO decide on the best way to deal with keys containing slashes in general
    slashless_attributes = {key.replace('/', ''): val for key, val in flat_attrs_colons.items()}
    return slashless_attributes
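# A sketch of the flattening (keys and values assumed, not from a real run):
#   {'meta': {'MXG': 2}, 'input': {'mesh': {'nx': 68}}}
#   ->  {'meta:MXG': 2, 'input:mesh:nx': 68}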
def combine_dump_data(files_to_load_prefix, output_file_handle,
                      vars_to_collect, vars_dims,
                      nxpe, nype, mxsub, mysub, mxg, myg):
    """
    Given a handle to a correctly set-up output NetCDF file, will collect all distributed variables from the BOUT
    dmp files and write them into the output file.
    Only opens one dmp.*.nc file at a time to minimise RAM usage.
    """
    # Loop over files to load from
    for j in range(nype):
        for i in range(nxpe):
            # Open one dump file
            file_num = (i + nxpe * j)
            dmp_file = files_to_load_prefix + str(file_num) + '.nc'
            print('Reading file ' + str(dmp_file) + '...', end='\r')
            dmp_ds = Dataset(dmp_file, "r", format="NETCDF4")

            # Only the requested variables are collected; anything else in the dump file is ignored
            for variable in vars_to_collect:
                var_dims = vars_dims[variable]
                dump_data = dmp_ds.variables[variable][:]

                # Remove ghost cells
                trimmed = remove_ghost(dump_data, i, j, mxg=mxg, myg=myg, nxpe=nxpe, nype=nype, dims=var_dims)

                # Insert chunk into output file
                insertion_loc = where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims)
                output_file_handle.variables[variable][insertion_loc] = trimmed

            # Close dump file
            dmp_ds.close()
def remove_ghost(fat, i, j, mxg, myg, nxpe, nype, dims):
    """
    Removes all ghost cells from the currently loaded dump file, but leaves the boundary guard cells alone.
    """
    # Boundary processors keep the guard cells on their boundary side; everything else trimmed is a ghost cell.
    # (These slices must stay consistent with the insertion slices in where_to_insert_dump_contents, which
    # place the leading guard cells of the i == 0 / j == 0 processors at the start of the global arrays.)
    if i == 0:
        xslice = slice(0, -mxg)    # left boundary in x: keep the leading guard cells
    elif i == nxpe - 1:
        xslice = slice(mxg, None)  # right boundary in x: keep the trailing guard cells
    else:
        xslice = slice(mxg, -mxg)  # interior in x: trim ghost cells from both sides

    if j == 0:
        yslice = slice(0, -myg)    # lower boundary in y: keep the leading guard cells
    elif j == nype - 1:
        yslice = slice(myg, None)  # upper boundary in y: keep the trailing guard cells
    else:
        yslice = slice(myg, -myg)  # interior in y: trim ghost cells from both sides

    selection = dim_dependent_slice(dims=dims, xslice=xslice, yslice=yslice)
    trimmed = fat[selection]
    return trimmed
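# Sketch for an assumed 3x3 processor layout with MXG = MYG = 2:
#   processor (i, j) = (0, 0) keeps its boundary guard cells: xslice = yslice = slice(0, -2)
#   processor (i, j) = (1, 1) is interior in both directions: xslice = yslice = slice(2, -2)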
def dim_dependent_slice(dims, tslice=slice(None, None, None), xslice=slice(None, None, None),
                        yslice=slice(None, None, None), zslice=slice(None, None, None)):
    """
    Returns a multidimensional slice which selects over the same dimensions as the dims argument.
    """
    if not dims:
        raise TypeError('There should not be scalar variables in this part of the code')
    selection = []
    if 't' in dims:
        selection.append(tslice)
    if 'x' in dims:
        selection.append(xslice)
    if 'y' in dims:
        selection.append(yslice)
    if 'z' in dims:
        selection.append(zslice)
    return tuple(selection)
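# A sketch of the expected output (dims assumed): for a variable with dims ('t', 'x', 'y')
# and xslice = slice(2, -2), this returns
#   (slice(None, None, None), slice(2, -2, None), slice(None, None, None))
# i.e. a tuple that indexes the variable directly, leaving the t and y dimensions untouched.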
def where_to_insert_dump_contents(i, j, mxsub, mysub, mxg, myg, nxpe, nype, var_dims):
    """
    Decides where in the output NetCDF file the data from the current dump file should be placed.
    """
    # Account for guard cells along the x dimension
    if i == 0:
        lower_x = mxsub * i
        upper_x = mxsub * (i + 1) + mxg
    elif i == nxpe - 1:
        lower_x = mxsub * i + mxg
        upper_x = mxsub * (i + 1) + 2 * mxg
    else:
        lower_x = mxsub * i + mxg
        upper_x = mxsub * (i + 1) + mxg
    xinsertion = slice(lower_x, upper_x, None)

    # Account for guard cells along the y dimension
    if j == 0:
        lower_y = mysub * j
        upper_y = mysub * (j + 1) + myg
    elif j == nype - 1:
        lower_y = mysub * j + myg
        upper_y = mysub * (j + 1) + 2 * myg
    else:
        lower_y = mysub * j + myg
        upper_y = mysub * (j + 1) + myg
    yinsertion = slice(lower_y, upper_y, None)

    insertion_loc = dim_dependent_slice(var_dims, xslice=xinsertion, yslice=yinsertion)
    return insertion_loc
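# A minimal driver sketch, assuming the script is run from a directory containing the
# BOUT.dmp.*.nc files and a BOUT.inp file (the paths and options below are illustrative):
if __name__ == '__main__':
    combine_boutdata(datadir='.', prefix='/BOUT.dmp.', inpfile='./BOUT.inp',
                     output_file_prefix='boutdata',
                     save_fields_separately=False,
                     save_grid=True, save_dtype='float32')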