dhubax

Retrieve and stack data from Pipe3D datacubes

Mar 4th, 2021
#!/usr/bin/env python3

# System imports
import sys
import glob
import numpy as np
import pandas as pd
from astropy.io import fits
from copy import deepcopy as copy
from os.path import basename, isfile, join, isdir

# Local imports
# from pyFIT3D.common.io import get_data_from_fits
# from pyFIT3D.common.tools import read_flux_elines, read_ELINES, read_SSP, read_SFH, read_indices

def read_flux_elines(filename, filename_lines=None, cube=True):
    # _flux_elines, h = get_data_from_fits(filename, header=True)
    t = fits.open(filename)
    h = t[0].header
    _flux_elines = t[0].data
    flux_elines = {}
    units = {}
    desc = {}
    ids = {}
    if cube:
        n = h['NAXIS3']
        nx, ny = h['NAXIS1'], h['NAXIS2']
        _dim = (ny, nx)
    else:
        n = h['NAXIS1']
        ns = h['NAXIS2']
        _dim = (ns,)
        _flux_elines = _flux_elines.T
    wavelengths = np.array([])
    if filename_lines is not None:
        # TODO create lines
        ne = 0
        with open(filename_lines) as fp:
            line = fp.readline()
            while line:
                if not line.startswith('#'):
                    tmp_line = line.strip().split()
                    if len(tmp_line) > 1:
                        wavelengths = np.append(wavelengths, tmp_line[0])
                    else:
                        wavelengths = np.append(wavelengths, 0)
                    ne += 1
                line = fp.readline()
    else:
        if 'WAVE0' not in h.keys():
            print(f'{filename}: missing wavelength information.')
            return None
        for i in range(n):
            wavelengths = np.append(wavelengths, h[f'WAVE{i}'])
    # property names (flux, vel, disp, ...) come from the NAME* headers
    props = np.unique([h[f'NAME{i}'].split(' ')[0] for i in range(n)])
    n_props = len(props)
    _wavelengths = copy(wavelengths)
    wavelengths = np.unique(_wavelengths)
    nw = len(wavelengths)
    for wl in wavelengths:
        flux_elines[wl] = {}
        units[wl] = {}
        desc[wl] = {}
        ids[wl] = {}
        for p in props:
            flux_elines[wl][p] = np.zeros(_dim)
    for i in range(n):
        wl = _wavelengths[i]
        prop_lname = h[f'NAME{i}'].split(' ')
        if len(prop_lname) < 2:
            continue
        p = prop_lname[0]
        flux_elines[wl][p] = copy(_flux_elines[i])
        ids[wl][p] = i
        units[wl][p] = h.get(f'UNIT{i}', None)
        desc[wl][p] = h.get(f'NAME{i}', None)
    del _flux_elines
    return flux_elines, h, ids, desc, units

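# Example usage of read_flux_elines (a minimal sketch; the filename is
# hypothetical). Wavelength keys are strings taken from the WAVE* headers and
# property keys (flux, vel, disp, e_disp, ...) from the NAME* headers:
#
#   fe, h, ids, desc, units = read_flux_elines('flux_elines.NGC5947.cube.fits.gz')
#   ha_flux = fe['6562.68']['flux']  # 2D Halpha flux map
#   ha_vel = fe['6562.68']['vel']    # 2D Halpha velocity map
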
def read_SSP(filename):
    # _SSP, h = get_data_from_fits(filename, header=True)
    t = fits.open(filename)
    h = t[0].header
    _SSP = t[0].data
    units = {}
    desc = {}
    # the first three channels of the SSP cube are fixed
    SSP = {
        'V': copy(_SSP[0]),
        'cont_seg': copy(_SSP[1]),
        'scale_seg': copy(_SSP[2]),
    }
    ids = {
        'V': 0,
        'cont_seg': 1,
        'scale_seg': 2,
    }
    units['V'] = h['UNITS_0'].strip()
    units['cont_seg'] = h['UNITS_1'].strip()
    units['scale_seg'] = h['UNITS_2'].strip()
    desc['V'] = h['DESC_0'].strip()
    desc['cont_seg'] = h['DESC_1'].strip()
    desc['scale_seg'] = h['DESC_2'].strip()
    # the remaining channel names are parsed from the FILE_* headers
    for i in range(3, h['NAXIS3']):
        _s = h[f'FILE_{i}'].strip().split('.')[2].split('_')
        _tmp = '_'.join(_s[1:-1])
        SSP[_tmp] = copy(_SSP[i])
        ids[_tmp] = i
        units[_tmp] = h[f'UNITS_{i}'].strip()
        desc[_tmp] = h[f'DESC_{i}'].strip()
    del _SSP
    return SSP, h, ids, desc, units

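# Example usage of read_SSP (a minimal sketch; the filename is hypothetical).
# Besides 'V', 'cont_seg' and 'scale_seg', the key names depend on the FILE_*
# headers of the cube:
#
#   ssp, h, ids, desc, units = read_SSP('NGC5947.SSP.cube.fits.gz')
#   v_band = ssp['V']  # V-band map, also used by the main block below
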
def read_ELINES(filename):
    # _ELINES, h = get_data_from_fits(filename, header=True)
    t = fits.open(filename)
    h = t[0].header
    _ELINES = t[0].data
    n_ELINES = h['NAXIS3']
    units = {}
    desc = {}
    # the first two channels are the Halpha velocity and dispersion maps
    ids = {
        'v_Halpha': 0,
        'disp_Halpha': 1,
    }
    ELINES = {
        'v_Halpha': copy(_ELINES[0]),
        'disp_Halpha': copy(_ELINES[1]),
    }
    units['v_Halpha'] = h['UNITS_0'].strip()
    units['disp_Halpha'] = h['UNITS_1'].strip()
    desc['v_Halpha'] = h['DESC_0'].strip()
    desc['disp_Halpha'] = h['DESC_1'].strip()
    # the remaining channels are named after the first word of DESC_*
    for i in range(2, n_ELINES):
        _tmp = h[f'DESC_{i}'].strip().split(' ')[0]
        ELINES[_tmp] = copy(_ELINES[i])
        units[_tmp] = h[f'UNITS_{i}'].strip()
        desc[_tmp] = h[f'DESC_{i}'].strip()
        ids[_tmp] = i
    del _ELINES
    return ELINES, h, ids, desc, units

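# Example usage of read_ELINES (a minimal sketch; the filename is
# hypothetical). Emission-line keys such as 'Halpha' or '[NII]6583' come from
# the DESC_* headers:
#
#   el, h, ids, desc, units = read_ELINES('NGC5947.ELINES.cube.fits.gz')
#   v_ha = el['v_Halpha']  # Halpha velocity map
#   f_ha = el['Halpha']    # Halpha flux map
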
def read_SFH(filename):
    # _SFH, h = get_data_from_fits(filename, header=True)
    t = fits.open(filename)
    h = t[0].header
    _SFH = t[0].data
    name = h['OBJECT'].strip()
    n_models = len([i for i in range(h['NAXIS3']) if f'{name}_NORM' in h[f'FILE_{i}']])
    mets_str = [
        h[f'FILE_{i}'].split('_')[1]
        for i in range(h['NAXIS3']) if 'NORM_met' in h[f'FILE_{i}']
    ]
    ages_str = [
        h[f'FILE_{i}'].split('_')[1]
        for i in range(h['NAXIS3']) if 'NORM_age' in h[f'FILE_{i}']
    ]
    age_models = np.asarray([float(x) for x in ages_str])
    met_models = np.asarray([float(x) for x in mets_str])
    n_age = len(age_models)
    n_met = len(met_models)
    # recover the (age, met) pair of each SSP model from its DESC_* header,
    # formatted as 'Luminosity Fraction for age-met AGE-MET SSP'
    age_met_models = [
        h[f'DESC_{i}'].replace('Luminosity Fraction for age-met ', '').replace(' SSP', '').split('-')
        for i in range(h['NAXIS3']) if f'{name}_NORM' in h[f'FILE_{i}']
    ]
    age_met_models_index = []
    for age, met in age_met_models:
        i_age = np.arange(n_age)[float(age) == age_models]
        i_met = np.arange(n_met)[float(met) == met_models]
        age_met_models_index.append((i_age, i_met))
    nx, ny = h['NAXIS1'], h['NAXIS2']
    SFH = {
        'models': np.zeros((n_models, ny, nx)),
        'age': np.zeros((n_age, ny, nx)),
        'met': np.zeros((n_met, ny, nx))
    }
    units = {
        'models': [],
        'age': [],
        'met': []
    }
    desc = {
        'models': [],
        'age': [],
        'met': []
    }
    ids = {
        'models': [],
        'age': [],
        'met': []
    }
    j, k, l = 0, 0, 0
    for i in range(h['NAXIS3']):
        _id = h[f'ID_{i}'] - 1
        val = _SFH[_id]
        try:
            _desc = h[f'DESC_{i}']
        except KeyError:
            _desc = None
        try:
            _units = h[f'UNITS_{i}']
        except KeyError:
            _units = 'fraction'
        if f'{name}_NORM_{_id}' in h[f'FILE_{i}']:
            SFH['models'][j] = copy(val)
            units['models'].append(_units)
            desc['models'].append(_desc)
            ids['models'].append(i)
            j += 1
        elif 'NORM_age' in h[f'FILE_{i}']:
            SFH['age'][k] = copy(val)
            units['age'].append(_units)
            desc['age'].append(_desc)
            ids['age'].append(i)
            k += 1
        elif 'NORM_met' in h[f'FILE_{i}']:
            SFH['met'][l] = copy(val)
            units['met'].append(_units)
            desc['met'].append(_desc)
            ids['met'].append(i)
            l += 1
    del _SFH
    models_metainfo = [age_met_models, age_met_models_index, age_models, met_models]
    return SFH, h, ids, desc, units, models_metainfo

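# Example usage of read_SFH (a minimal sketch; the filename is hypothetical):
#
#   sfh, h, ids, desc, units, metainfo = read_SFH('NGC5947.SFH.cube.fits.gz')
#   age_met, age_met_index, ages, mets = metainfo
#   young = sfh['age'][0]  # luminosity-fraction map of the first age bin
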
def read_indices(filename):
    # _indices, h = get_data_from_fits(filename, header=True)
    t = fits.open(filename)
    h = t[0].header
    _indices = t[0].data
    indices = {}
    units = None
    desc = None
    ids = {}
    n = h['NAXIS3']
    nx, ny = h['NAXIS1'], h['NAXIS2']
    # channel names come from the INDEX* headers; `desc` keeps the last one read
    for i in range(n):
        desc = h[f'INDEX{i}']
        indices[desc] = copy(_indices[i])
        ids[desc] = i
    del _indices
    return indices, h, ids, desc, units

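# Example usage of read_indices (a minimal sketch; the filename and the 'Hd'
# key are hypothetical, since index names come from the INDEX* headers):
#
#   ind, h, ids, desc, units = read_indices('indices.CS.NGC5947.cube.fits.gz')
#   hd_map = ind['Hd']  # 2D map of one stellar index
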
def iter_CALIFA_gals_from_datacubes(workdir, list_gals=None):
    """
    Iterate over galaxy names from a CALIFA datacubes workdir.

    Parameters
    ----------
    workdir : str
        Datacubes directory.

    list_gals : str or None
        CSV or single-column file with a list of galaxies. If `list_gals`
        is a CSV file, only the first column is read (so get_proc_elines.csv
        can be used directly as a list of galaxies). If None, the list of
        galaxies is generated from all SSP datacube filenames inside
        `workdir`. Defaults to None.

    Yields
    ------
    [str, str] :
        The workdir and the galaxy name.
    """
    if list_gals is not None:
        gals = np.genfromtxt(list_gals, usecols=[0], comments='#', delimiter=',', dtype='str')
        gals = gals.tolist()
    else:
        gals = []
        for _f in glob.glob(join(workdir, '*.SSP.cube.fits.gz')):
            gals.append(basename(_f).split('.')[0])
    for g in gals:
        yield workdir, g

def iter_MaNGA_gals_from_datacubes(workdir, list_gals=None):
    """
    Iterate over galaxy names from a MaNGA datacubes workdir. It differs from
    `iter_CALIFA_gals_from_datacubes` because of the hierarchy of the MaNGA
    directories (the datacubes are divided into workdir/plate/ifu/*).

    Parameters
    ----------
    workdir : str
        Datacubes directory.

    list_gals : str or None
        CSV or single-column file with a list of galaxies. If `list_gals`
        is a CSV file, only the first column is read (so get_proc_elines.csv
        can be used directly as a list of galaxies). If None, the list of
        galaxies is generated from all SSP datacube filenames inside
        `workdir`. Defaults to None.

    Yields
    ------
    [str, str] :
        The workdir (i.e. `workdir` joined with the plate and ifu parsed from
        the galaxy name) and the galaxy name.
    """
    if list_gals is not None:
        gals = np.genfromtxt(list_gals, usecols=[0], comments='#', delimiter=',', dtype='str')
        gals = gals.tolist()
    else:
        gals = []
        for wd_plate in filter(isdir, glob.glob(join(workdir, '*'))):
            for wd_ifu in filter(isdir, glob.glob(join(wd_plate, '*'))):
                for _f in glob.glob(join(wd_ifu, '*.SSP.cube.fits.gz')):
                    gals.append(basename(_f).split('.')[0])
    for g in gals:
        # MaNGA names look like 'manga-PLATE-IFU'
        _split = g.split('-')
        plate = _split[1]
        ifu = _split[2]
        yield join(workdir, join(plate, ifu)), g

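# Example usage of the iterators (a minimal sketch; the paths are
# hypothetical):
#
#   for wdir, gal in iter_CALIFA_gals_from_datacubes('/path/to/dataproducts'):
#       print(wdir, gal)
#
#   for wdir, gal in iter_MaNGA_gals_from_datacubes('/path/to/manga/outputs'):
#       # wdir already includes the plate/ifu subdirectories
#       print(wdir, gal)
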
if __name__ == '__main__':
    ELINES_file = '{}.ELINES.cube.fits.gz'
    flux_elines_file = 'flux_elines.{}.cube.fits.gz'
    SSP_file = '{}.SSP.cube.fits.gz'
    SFH_file = '{}.SFH.cube.fits.gz'
    indices_file = 'indices.CS.{}.cube.fits.gz'
    mask_file = 'mask.{}.g.fits.gz'

    # SOME FLUX ELINES KEYS
    fek = {'Ha': '6562.68', 'NII6548': '6548.08', 'NII6583': '6583.41',
           '[OIII]': '5006.84', '[OII]': '3727.4', 'Hb': '4861.32'}

    # SOME ELINES KEYS
    Ek = {'Ha': 'Halpha', 'NII6583': '[NII]6583', 'NII6548': '[NII]6548'}

    df = []

    ##############################################################
    # FOR MANGA DATACUBES UNCOMMENT AND EDIT THE FOLLOWING LINES

    # workdir = '/mnt/NASfilemon/MaNGA/v3_1_1/out_newver'
    # list_gals = None
    # for _wdir, g in iter_MaNGA_gals_from_datacubes(workdir=workdir, list_gals=list_gals):

    ##############################################################

    ##############################################################
    # FOR CALIFA DATACUBES UNCOMMENT AND EDIT THE FOLLOWING LINES

    workdir = '/mnt/NASfilemon/disk-d/sanchez/ppak/legacy/DATA/SSP/analysis_v2.2_MaStars_sLOG/dataproducts'
    list_gals = '/mnt/NASfilemon/califa/get_proc_elines_CALIFA.all_good.csv'
    for _wdir, g in iter_CALIFA_gals_from_datacubes(workdir=workdir, list_gals=list_gals):

    ##############################################################

        # build the filenames of all Pipe3D dataproducts for this galaxy
        _f_fe = join(_wdir, flux_elines_file.format(g))
        _f_EL = join(_wdir, ELINES_file.format(g))
        _fSSP = join(_wdir, SSP_file.format(g))
        _fSFH = join(_wdir, SFH_file.format(g))
        _find = join(_wdir, indices_file.format(g))
        _fmsk = join(_wdir, mask_file.format(g))

        if not isfile(_fmsk):
            print(f'{basename(sys.argv[0])}: {g}: {_fmsk}: missing mask file')
            _mask = None
        else:
            # _mask = get_data_from_fits(_fmsk)
            _mask = fits.open(_fmsk)[0].data

        if not (isfile(_f_fe) and isfile(_f_EL) and isfile(_fSSP) and isfile(_fSFH) and isfile(_find)):
            print(f'{basename(sys.argv[0])}: {g}: missing cubes')
            continue

        # read cubes
        r = read_flux_elines(_f_fe)
        _flux_elines, fe_h, fe_ids, fe_desc, fe_units = r
        r = read_ELINES(_f_EL)
        _ELINES, E_h, E_ids, E_desc, E_units = r
        r = read_SSP(_fSSP)
        _SSP, SSP_h, SSP_ids, SSP_desc, SSP_units = r
        _V = _SSP['V']
        r = read_SFH(_fSFH)
        _SFH, SFH_h, SFH_ids, SFH_desc, SFH_units, SFH_metainfo = r
        r = read_indices(_find)
        _ind, ind_h, ind_ids, ind_desc, ind_units = r

        V = _V.flatten().tolist()
        name = [g]*len(V)

        if _mask is None:
            mask = np.ones_like(V).tolist()
        else:
            mask = _mask.flatten().tolist()

        # HERE YOU HAVE TO BUILD YOUR OWN DATAFRAME WITH THE NEEDED DATA FLATTENED
        df.append(
            pd.DataFrame(
                {
                    # NAME TAG
                    'name': name,

                    # V-BAND MAP
                    'V': V,

                    # stellar mask used in CALIFA
                    'mask': mask,

                    'fedOII': _flux_elines[fek['[OII]']]['disp'].flatten().tolist(),
                    'fedOIII': _flux_elines[fek['[OIII]']]['disp'].flatten().tolist(),
                    'fedOII_err': _flux_elines[fek['[OII]']]['e_disp'].flatten().tolist(),
                    'fedOIII_err': _flux_elines[fek['[OIII]']]['e_disp'].flatten().tolist(),

                    #### HA DATA FROM ELINES AND FLUX_ELINES ####
                    # 'EfHa': _ELINES[Ek['Ha']].flatten().tolist(),
                    # 'EvHa': _ELINES[f"v_{Ek['Ha']}"].flatten().tolist(),
                    # 'EdHa': _ELINES[f"disp_{Ek['Ha']}"].flatten().tolist(),
                    # 'fefHa': _flux_elines[fek['Ha']]['flux'].flatten().tolist(),
                    # 'fevHa': _flux_elines[fek['Ha']]['vel'].flatten().tolist(),
                    # 'fedHa': _flux_elines[fek['Ha']]['disp'].flatten().tolist(),
                    #############################################

                    #### [NII] DATA FROM ELINES AND FLUX_ELINES ####
                    # 'EfNII6583': _ELINES[Ek['NII6583']].flatten().tolist(),
                    # 'EfNII6548': _ELINES[Ek['NII6548']].flatten().tolist(),
                    # 'fefNII6583': _flux_elines[fek['NII6583']]['flux'].flatten().tolist(),
                    # 'fefNII6548': _flux_elines[fek['NII6548']]['flux'].flatten().tolist(),
                    ################################################

                }
            )
        )

    # generate the final dataframe and output it to a CSV file
    df = pd.concat(df, ignore_index=True)
    df.to_csv('datacubes_retrieve.csv')