Source code for uchuutools.ctrees_utils

# -*- coding: utf-8 -*-

"""
Utilities for Consistent-Trees Catalogues
=========================================
Provides several useful utility functions that work with tree catalogues
generated by Consistent-Trees.
"""

__author__ = "Manodeep Sinha"
__all__ = ("read_locations_and_forests", "get_aggregate_forest_info",
           "get_all_parallel_ctrees_filenames",
           "check_forests_locations_filenames",
           "validate_inputs_are_ctrees_files",
           "get_treewalk_dtype_descr", "add_tree_walk_indices", )

import time

from .utils import generic_reader, check_and_decompress, get_metadata


# requires numpy >= 1.9.0
[docs]def read_locations_and_forests(forests_fname, locations_fname, rank=0):
    """
    Returns a numpy structured array that contains *both* the forest and
    tree level info.

    Parameters
    ----------

    forests_fname: string, required
        The name of the file containing forest-level info like the
        Consistent-Trees 'forests.list' file

    locations_fname: string, required
        The name of the file containing tree-level info like the
        Consistent-Trees 'locations.dat' file

    rank: integer, optional, default:0
        An integer identifying which task is calling this function. Only
        used in status messages

    Returns
    -------
    trees_and_locations: A numpy structured array
        A numpy structured array containing the fields
        <TreeRootID ForestID Filename FileID Offset TreeNbytes>
        The array is sorted by ``(ForestID, Filename, Offset)`` in that order.
        This sorting means that *all* trees belonging to the same forest
        *will* appear consecutively regardless of the file that the
        corresponding tree data might appear in. The number of elements
        in the array is equal to the number of trees.

        Note: Sorting by ``Filename`` is implemented by an equivalent, but
        faster sorting on ``FileID``.

    """
    import os  # use os.stat to get filesize
    import numpy as np
    import numpy.lib.recfunctions as rfn

    def _guess_dtype_from_val(val):
        converters_and_type = [(np.float, np.float),
                               (str, 'U1024')]
        fallback_dtype = 'V'
        for (conv, dtype) in converters_and_type:
            try:
                _ = conv(val)
                return dtype
            except ValueError:
                pass

        print(f"Warning: Could not guess datatype for val = '{val}'. "
              f"Returning datatype = '{fallback_dtype}'")
        return fallback_dtype

    def _get_dtype_from_header(fname):
        with generic_reader(fname, 'r') as f:
            f = iter(f)
            hdr = next(f)
            hdr = hdr.strip('#').rstrip()
            dataline = next(f)

        known_fields = {
            'Offset': np.int64,
            'Filename': 'U512',
        }

        # Now split on whitespace
        fields = hdr.split()
        datavals = dataline.split()
        fields_and_type = [None]*len(fields)
        for ii, (fld, val) in enumerate(zip(fields, datavals)):
            if 'ID' in fld.upper():
                dtype = np.int64
            else:
                try:
                    dtype = known_fields[fld]
                except KeyError:
                    dtype = _guess_dtype_from_val(val)
                    print(f"Warning: Did not find key = '{fld}' in the known "
                          f"columns. Assuming '{dtype}'")
                    pass

            fields_and_type[ii] = (fld, dtype)

        return np.dtype(fields_and_type)

    # The main `read_locations_and_forests` function begins here
    forests_dtype = _get_dtype_from_header(forests_fname)
    locations_dtype = _get_dtype_from_header(locations_fname)

    # Check that both the dtypes have the 'common' field that
    # we are later going to use to join the two files
    join_key = 'TreeRootID'
    if join_key not in forests_dtype.names or \
       join_key not in locations_dtype.names:
        msg = f"Error: Expected to find column = '{join_key}' in "\
              f"both the forests file ('{forests_fname}') and "\
              f"the locations file ('{locations_fname}')"
        msg += f"Columns in forests file = {forests_dtype.names}"
        msg += f"Columns in locations file = {locations_dtype.names}"
        raise KeyError(msg)

    t0 = time.perf_counter()
    print(f"[Rank={rank}]: Reading forests file '{forests_fname}'")
    forests = np.loadtxt(forests_fname, comments='#', dtype=forests_dtype)
    t1 = time.perf_counter()
    print(f"[Rank={rank}]: Reading forests file '{forests_fname}'...done. "
          f"Time taken = {t1-t0:.2f} seconds")

    t0 = time.perf_counter()
    print(f"[Rank={rank}]: Reading locations file '{locations_fname}' ...")
    locations = np.loadtxt(locations_fname, comments='#',
                           dtype=locations_dtype)
    t1 = time.perf_counter()
    print(f"[Rank={rank}]: Reading locations file '{locations_fname}' "
          f"...done. Time taken = {t1-t0:.2f} seconds")

    # Check that the base directory is the same
    # for the 'forests.list' and 'locations.dat' files
    # Otherwise, since the code the code logic will fail in the conversion
    dirname = list(set([os.path.dirname(f) for f in [forests_fname,
                                                     locations_fname]]))
    if len(dirname) != 1:
        msg = "Error: Standard Consistent-Trees catalogs *should* "\
              "create the 'forests.list' and 'locations.dat' in the "\
              f"same directory. Instead found directories = {dirname}\n"\
              f"Input files were = ({forests_fname}, {locations_fname})"
        raise ValueError(msg)

    dirname = dirname[0]
    # The 'locations.dat' file does not have fully-qualified paths
    # Add the prefix so all future queries woork out just fine
    filenames = [f'{dirname}/{fname}' for fname in locations['Filename']
                 if '/' not in fname]
    locations['Filename'][:] = filenames

    t0 = time.perf_counter()
    print(f"[Rank={rank}]: Joining the forests and locations arrays...")
    trees_and_locations = rfn.join_by(join_key, forests, locations,
                                      jointype='inner')
    if trees_and_locations.shape[0] != forests.shape[0]:
        raise AssertionError("Error: Inner join failed to preserve the shape")

    ntrees = trees_and_locations.shape[0]
    treenbytes = np.zeros(ntrees, dtype=np.int64)
    trees_and_locations = rfn.append_fields(trees_and_locations,
                                            'TreeNbytes',
                                            treenbytes)
    t1 = time.perf_counter()
    print(f"[Rank={rank}]: Joining the forests and locations arrays...done. "
          f"Time taken = {t1-t0:.2f} seconds")

    # All the fields have been assigned into the combined structure
    # Now, we need to figure out the bytes size of each forest by sorting on
    # FileID and Offset. The locations.dat file contains the offset where
    # the tree data starts but does not contain where the tree data ends. By
    # sorting on ('FileID', 'Offset'), we can have a guess at where the current
    # tree ends by looking at where the next tree starts, and then removing all
    # intervening bytes. Here, the intervening bytes are what's written out by
    # this line -> https://bitbucket.org/pbehroozi/consistent-trees/src/9668e1f4d396dcecdcd1afab6ac0c80cbd561d72/src/assemble_halo_trees.c#lines-202
    #
    #   fprintf(tree_outputs[file_id], "#tree %"PRId64"\n", halos[0].id);
    #
    # Since we already have the TreeRootID, we know *exactly* how many bytes
    # are taken up by that printf statement. We subtract that many bytes from
    # the next-tree-starting-offset, then we have the
    # current-tree-ending-offset. The last tree in the file will have to
    # special-cased -- in this case, the end of tree data is known to
    # be at end-of-file (EOF). -- MS 16/04/2020

    # Sort the array on filename, and then by offset
    # Sorting by fileID is faster than sorting by filename
    trees_and_locations.sort(order=['FileID', 'Offset'])

    # Calculate the number of bytes in every tree
    t0 = time.perf_counter()
    print(f"[Rank={rank}]: Computing number of bytes per tree...")
    nexttid = trees_and_locations[join_key][1:]
    nexttidlen = np.array([len(f"#tree {x:d}\n") for x in nexttid],
                          dtype=np.int64)
    nextstart = trees_and_locations['Offset'][1:]
    thisend = nextstart - nexttidlen
    treenbytes[0:-1] = thisend - trees_and_locations['Offset'][0:-1]

    # Now fix the last tree for any file, (special-case the absolute last tree)
    uniq_tree_filenames = list(set(trees_and_locations['Filename']))
    filesizes_dict = {fname: os.stat(fname).st_size
                      for fname in uniq_tree_filenames}

    fileid_changes_ind = [ntrees-1]
    if len(uniq_tree_filenames) > 1:
        fileid = trees_and_locations['FileID'][:]
        fileid_diffs = np.diff(fileid)
        ind = (np.where(fileid_diffs != 0))[0]
        if ind:
            fileid_changes_ind.extend(ind)

    for ii in fileid_changes_ind:
        fname = trees_and_locations['Filename'][ii]
        off = trees_and_locations['Offset'][ii]
        treenbytes[ii] = filesizes_dict[fname] - off

    if treenbytes.min() <= 0:
        msg = "Error: Trees must span more than 0 bytes. "\
             f"Found treenbytes.min() = {treenbytes.min()}"
        raise AssertionError(msg)

    trees_and_locations['TreeNbytes'][:] = treenbytes
    t1 = time.perf_counter()
    print(f"[Rank={rank}]: Computing number of bytes per tree...done. "
          f"Time taken = {t1-t0:.2f} seconds")

    # Now every tree has an associated size in bytes
    # Now let's re-sort the trees such that all trees from
    # the same forest are contiguous in the array.
    trees_and_locations.sort(order=['ForestID', 'Filename', 'Offset'],
                             kind='heapsort')

    return trees_and_locations


[docs]def get_aggregate_forest_info(trees_and_locations, rank=0):
    """
    Returns forest-level information from the tree-level information
    supplied.

    Parameters
    -----------

    trees_and_locations: numpy structured array, required
        A numpy structured array that contains the tree-level
        information. Should be the output from the function
        :func:`read_locations_and_forests`

    rank: integer, optional, default:0
        An integer identifying which task is calling this function. Only
        used in status messages

    Returns
    -------
    forest_info: A numpy structured array
        The structured array contains the fields
        ``['ForestID', 'ForestNhalos', 'Input_ForestNbytes', 'Ntrees']``
        The 'ForestNhalos' field is set to 0, and is populated as the
        trees are processed. The number of elements in the array is
        equal to the number of forests.

    """

    import numpy as np

    # The smallest granularity for parallelisation is a
    # single forest --> therefore, we also create this
    # additional array with info at the forest level.

    ntrees = trees_and_locations.shape[0]
    # The strategy requires the 'trees_and_locations' to be
    # already sorted on 'ForestID'
    uniq_forestids, ntrees_per_forest = \
        np.unique(trees_and_locations['ForestID'],
                  return_counts=True)
    nforests = uniq_forestids.shape[0]
    forest_info_dtype = np.dtype([('ForestID', np.int64),
                                  ('ForestNhalos', np.int64),
                                  ('Input_ForestNbytes', np.int64),
                                  ('Ntrees', np.int64)])

    forest_info = np.empty(nforests, dtype=forest_info_dtype)
    forest_info['ForestID'][:] = uniq_forestids

    # We can only fill in the number of halos *after* reading in
    # the halos (across all the trees)
    forest_info['ForestNhalos'][:] = 0
    forest_info['Ntrees'][:] = ntrees_per_forest

    t0 = time.perf_counter()
    print(f"[Rank={rank}]: Calculating number of bytes per forest...")
    ends = np.cumsum(ntrees_per_forest)
    if ends[-1] != ntrees:
        msg = "Error: Something strange happened while computing the "\
              "number of bytes per tree. Expected the last element "\
              "of the array containing the cumulative sum of number "\
             f"of trees per forest = '{ends[-1]}' to equal the total number "\
             f"of trees being processed = '{ntrees}'. Please check "\
              "that the TreeRootID and ForestID values are unique...exiting"
        raise AssertionError(msg)

    starts = np.zeros_like(ends, dtype=np.int64)
    starts[1:] = ends[0:-1]

    treenbytes = trees_and_locations['TreeNbytes'][:]
    if treenbytes.min() <= 0:
        msg = "Error: While computing the number of bytes per forest. "\
              "Expected *all* trees to contain at least one entry "\
              "(i.e., span non-zero bytes in the input file)."\
              "However, minimum of the array containing the "\
             f"number of bytes per tree = '{treenbytes.min()}'."\
              "Perhaps the input files are corrupt? Exiting..."
        raise AssertionError(msg)

    forestnbytes = np.array([treenbytes[start:end].sum()
                             for start, end in zip(starts, ends)])
    if forestnbytes.shape[0] != nforests:
        msg = "Error: There must be some logic issue in the code. Expected "\
              "that the shape of the array containing the number of bytes "\
             f"per forest = '{forestnbytes.shape[0]}' to be *exactly* "\
             f"equal to nforests = '{nforests}'. Exiting..."
        raise AssertionError(msg)

    # Since we have already checked that 'treenbytes.min() > 0'
    # --> no need to check that forestnbytes.min() is > 0

    forest_info['Input_ForestNbytes'][:] = forestnbytes
    t1 = time.perf_counter()
    print(f"[Rank={rank}]: Calculating number of bytes per forest...done. "
          f"Time taken = {t1-t0:0.2f} seconds")

    return forest_info


[docs]def get_all_parallel_ctrees_filenames(fname):
    """
    Returns three filenames corresponding to the ``'forests.list'``,
    ``'locations.dat'``, and ``'tree_*.dat'`` files *assuming* the naming
    convention the Uchuu collaboration's parallel Consistent-Trees code

    Parameters
    -----------

    fname: string, required
        A filename specifying the tree data file for a parallel
        Consistent-Trees soutput

    Returns
    --------

    forests_file, locations_file, treedata_file: strings, filenames
        The filenames corresponding to the ``'forests.list'``,
        ``'locations.dat'`` and the ``'tree_*_*_*.dat'`` file
        *generated using* the convention of the Uchuu collaboration.
        The convention is:

            ================   ==================
            Standard CTrees    Parallel CTrees
            ================   ==================
            'forests.list'     '<prefix>.forest'
            'locations.dat'    '<prefix>.loc'
            'tree_*_*_*.dat'   '<prefix>.tree'
            ================   ==================

    """
    import os

    fname = check_and_decompress(fname)

    # Need to create the three filenames for the
    # forests.list, locations.dat and tree*.dat files
    dirname = os.path.dirname(fname)

    # Get the filename and remove the file extension
    basename, _ = os.path.splitext(os.path.basename(fname))

    forests_file = f'{dirname}/{basename}.forest'
    locations_file = f'{dirname}/{basename}.loc'
    treedata_file = f'{dirname}{basename}.tree'

    return forests_file, locations_file, treedata_file


[docs]def check_forests_locations_filenames(filenames):
    """
    Accepts two filenames (in any order) and checks whether
    the files contain the correct data as expected
    in 'forests.list', 'locations.dat' and returns the
    files as (equivalent to) 'forests.list' and 'locations.dat'

    Parameters
    -----------

    filenames: list of two filenames, string, required
        List containing two filenames corresponding to the standard
        'forests.list' and 'locations.dat' (in any order, i.e.,
        both ['forests.list', 'locations.dat'] *and*
        ['locations.dat', 'forests.list'] are valid)

    Returns
    --------

    forests_file, locations_file: strings, filenames
        The filenames equivalent to the 'forests.list', 'locations.dat'
        (in that order)

    """
    if len(filenames) != 2:
        msg = "Error: Expected to get only two filenames. Instead got "\
              f"filenames = '{filenames}' with len(filenames) = "\
              f"{len(filenames)}"
        raise AssertionError(msg)

    forests_file, locations_file = filenames[0], filenames[1]

    # Check if the first line in the potential `forests_file` contains
    # the expected 'ForestID'. We could even check that the entire
    # first line matches '#TreeRootID ForestID'
    with generic_reader(forests_file, 'rt') as f:
        line = f.readline()
        if 'ForestID' not in line:
            forests_file, locations_file = locations_file, forests_file

    # Now check that the locations file is a valid CTrees file
    with generic_reader(locations_file, 'rt') as f:
        line = f.readline()
        if 'FileID' not in line or \
           'Offset' not in line or \
           'Filename' not in line:
            msg = f"Error: The first line in the locations_file = "\
                  f"'{locations_file}' does not contain the expected "\
                  f"fields. First line = '{line}'"
            raise AssertionError(msg)

    return forests_file, locations_file


[docs]def validate_inputs_are_ctrees_files(ctrees_filenames, base_metadata=None,
                                     base_version=None,
                                     base_input_catalog_type=None):
    """
    Checks the files contain Consistent-Trees catalogues derived from
    the same simulation and Consistent-Trees configuration.

    Parameters
    -----------

    ctrees_filenames: list of filenames, string, required
        The input filenames (potentially) containing Consistent-Trees
        catalogues.

        Note: Only unique filenames within this list are checked

    Returns
    --------

    True on successful validation, ValueError otherwise


    """
    import numpy as np

    # Read the entire header meta-data from the first file
    files = np.unique(ctrees_filenames)
    if not base_version:
        base_metadata_dict = get_metadata(files[0])
        base_metadata = np.string_(base_metadata_dict['metadata'])
        base_version = base_metadata_dict['version']
        base_input_catalog_type = base_metadata_dict['catalog_type']

    for fname in files:
        metadata_dict = get_metadata(fname)
        metadata = np.string_(metadata_dict['metadata'])
        version = metadata_dict['version']
        input_catalog_type = metadata_dict['catalog_type']

        if 'Consistent' not in input_catalog_type:
            msg = "Error: This script is meant *only* to process "\
                  "Consistent-Tree catalogs. Found catalog type = "\
                  f"{input_catalog_type} instead (input file = {fname})"
            raise ValueError(msg)

        if 'hlist' in input_catalog_type:
            msg = "Error: This script is meant *only* to process "\
                  "Consistent-Tree catalogs containing tree data. "\
                  "Found a halo catalogue (generated by "\
                  "Consistent-Trees)\" instead. catalog type = "\
                  f"{input_catalog_type} instead (input file = {fname})"
            raise ValueError(msg)

        if base_input_catalog_type != input_catalog_type:
            msg = f"Error: Catalog type = '{input_catalog_type}' for file "\
                  f"'{fname}' does *not* match the catalog type = "\
                  f"'{base_input_catalog_type}' for file '{files[0]}'"
            raise ValueError(msg)

        if base_version != version:
            msg = f"Error: Version = '{version}' for file '{fname}' does "\
                  f"*not* match the version = '{base_version}' for "\
                  f"file '{files[0]}'"
            raise ValueError(msg)

        if not np.array_equal(base_metadata, metadata):
            msg = f"Error: metadata = '{metadata}' for file '{fname}' "\
                  f"does *not* match the metadata = '{base_metadata}' "\
                  f"for file '{files[0]}'"
            raise ValueError(msg)

    return True


def assign_fofids(forest, rank=0):
    """
    Fills the FofID field for all halos.

    Parameters
    -----------

    forest: numpy structured array, required
        An array containing all halos from the same forest

        Note: The tree-walking indices (i.e., columns returned by
        the function ``get_treewalk_dtype_descr``) are expected to be
        filled with -1.

    rank: integer, optional, default=0
        The unique identifier for the current task. Only used
        within an error statement

    Returns
    --------
        The FofID field is filled in-place, and this function has
        no return value

    """

    import numpy as np

    # Assign fofhaloids to the halos
    fofhalos = (np.where(forest['pid'] == -1))[0]
    if len(fofhalos) == 0:
        nhalos = forest.shape[0]
        msg = f"Error: There are no FOF halos among these {nhalos} passed.\n"
        msg += f"forest = {forest}"
        raise ValueError(msg)

    # First fix the upid, and assign the (self) FofIDs
    # to the FOF halos
    fofhalo_ids = forest['id'][fofhalos]
    forest['upid'][fofhalos] = fofhalo_ids
    forest['FofID'][fofhalos] = fofhalo_ids

    # First set the upid, fofid for the FOFs
    haloids = forest['id'][:]
    sorted_halo_idx = np.argsort(haloids)

    rem_sub_inds = (np.where(forest['FofID'] == -1))[0]
    nleft = len(rem_sub_inds)
    while nleft > 0:
        # Need to match un-assigned (pid, upid) with fofid, id of halos with
        # assigned fofid
        nassigned = 0
        for fld in ['upid', 'pid']:
            if len(rem_sub_inds) != nleft:
                msg = "Error: (Bug in code) Expected to still have some "\
                      f"remaining subhalos that needed assigning, since "\
                      f"nleft = {nleft} "\
                      f"with len(rem_sub_inds) = {len(rem_sub_inds)}\n"\
                      f"rem_sub_inds = {rem_sub_inds}."
                raise AssertionError(msg)

            fld_id = forest[fld][rem_sub_inds]

            # 'pid'/'upid' might be duplicated ->
            # np.intersect1d will only return one value; other
            # halos with the same 'pid' will not get updated to the
            # correct fof. Need a 'searchsorted' like implementation
            # but know that not *all* pids will be found
            fld_idx = np.searchsorted(haloids, fld_id, sorter=sorted_halo_idx)
            forest['FofID'][rem_sub_inds] = forest['FofID'][sorted_halo_idx[fld_idx]]
            nassigned += len(rem_sub_inds)
            rem_sub_inds = np.where(forest['FofID'] == -1)[0]
            nassigned -= len(rem_sub_inds)
            msg = f"Error: nassigned = {nassigned} must be at least 0."
            assert nassigned >= 0, msg
            nleft = len(rem_sub_inds)
            if nleft == 0:
                break

        if nleft > 0 and nassigned == 0:
            msg = f"[Rank={rank}]: Error: There are {nleft} halos left "\
                  f"without fofhalos assigned but could not assign a "\
                  f"single fof halo in this iteration\n"
            xx = np.where(forest['FofID'] == -1)[0]
            msg += f"forest['pid'][xx] = {forest['pid'][xx]}\n"
            msg += f"forest['upid'][xx] = {forest['upid'][xx]}\n"
            msg += f"forest[xx] = {forest[xx]}\n"
            pid = forest['pid'][xx]
            xx = np.where(forest['id'] == pid)[0]
            msg += f"pid = {pid} forest['id'] = {forest['id'][xx]} "\
                   f"(should be equal to 'pid')\n"
            msg += f"fofid for the 'pid' halo = {forest['FofID'][xx]}"
            raise ValueError(msg)

    # Reset the FOF upid field
    assert forest['pid'][fofhalos].min() == -1
    assert forest['pid'][fofhalos].max() == -1
    forest['upid'][fofhalos] = -1

    return


[docs]def get_treewalk_dtype_descr():
    """
    Returns the description for the additional fields
    to add to the forest for walking the mergertree

    Parameters
    -----------
    None

    Returns
    --------
    mergertree_descr: list of tuples
        A list of tuples containing the names and datatypes
        for the additional columns needed for walking the
        mergertree. This list can be used to create a numpy
        datatype suitable to contain the additional mergertree
        indices

    """

    import numpy as np
    mergertree_descr = [('FofID', np.int64),
                        ('FirstHaloInFOFgroup', np.int64),
                        ('NextHaloInFOFgroup', np.int64),
                        ('PrevHaloInFOFgroup', np.int64),
                        ('FirstProgenitor', np.int64),
                        ('NextProgenitor', np.int64),
                        ('PrevProgenitor', np.int64),
                        ('Descendant', np.int64)]

    return mergertree_descr


[docs]def add_tree_walk_indices(forest, rank=0):
    """
    Adds the various mergertree walking indices and IDs.
    These include indices to access the host FOF halo<->subhalo
    hierarchy, and the progenitor-descendant hierarchy.

    The mergertree indices are filled in-place and no additional
    return occurs. The specific indices are listed
    in the function :func:`get_treewalk_dtype_descr`

    Parameters
    -----------

    forest: numpy structured array, required
        An array containing all halos from the same forest

    rank: integer, optional, default=0
        The unique identifier for the current task. Only used
        within an error statement

    Returns
    --------

    True on successful completion
    """
    import numpy as np

    mergertree_descr = get_treewalk_dtype_descr()

    # initialise all the mergertree indices
    for name, _ in mergertree_descr:
        forest[name][:] = -1

    # Assign the FOFhalo IDs
    assign_fofids(forest, rank)

    # Assign the index for the fofhalo
    forest['Mvir'] *= -1  # hack so that the sort is decreasing order
    order = ['scale', 'FofID', 'upid', 'Mvir', 'id']
    sorted_fof_order = forest.argsort(order=order)
    forest['Mvir'] *= -1  # reset mass back

    # upid of FOFs is set to -1, therefore the FOF halo should be the first
    # halo with that FOFID (i.e., equal to its own haloID)
    uniq_fofs, firstfof_inds = np.unique(forest['FofID'][sorted_fof_order],
                                         return_index=True)

    nextsub_loc = np.split(sorted_fof_order, firstfof_inds[1:])
    for lhs in nextsub_loc:
        assert forest['FofID'][lhs].min() == forest['FofID'][lhs].max()
        assert forest['FofID'][lhs].min() == forest['id'][lhs[0]]
        forest['FirstHaloInFOFgroup'][lhs] = lhs[0]
        forest['NextHaloInFOFgroup'][lhs] = [*lhs[1:], -1]
        forest['PrevHaloInFOFgroup'][lhs] = [-1, *lhs[:-1]]

    # Now match the descendants
    haloids = forest['id'][:]
    desc_ids = forest['desc_id'][:]
    sorted_halo_idx = np.argsort(haloids)

    valid_desc_idx = desc_ids != -1
    desc_ind = np.searchsorted(haloids, desc_ids[valid_desc_idx],
                               sorter=sorted_halo_idx)

    msg = "np.searchsorted did not work as expected"
    np.testing.assert_array_equal(desc_ids[valid_desc_idx],
                                  haloids[sorted_halo_idx[desc_ind]],
                                  err_msg=msg)
    forest['Descendant'][valid_desc_idx] = sorted_halo_idx[desc_ind]

    # Check that *all* desc_ids were found
    msg = "Expected to find *all* descendants"
    assert set(haloids[sorted_halo_idx[desc_ind]]) == set(desc_ids[valid_desc_idx]), msg

    np.testing.assert_array_equal(forest['desc_id'][valid_desc_idx],
                                  forest['id'][sorted_halo_idx[desc_ind]],
                                  err_msg='descendant ids are incorrect')
    np.testing.assert_array_equal(forest['desc_scale'][valid_desc_idx],
                                  forest['scale'][sorted_halo_idx[desc_ind]],
                                  err_msg='descendant scales are incorrect')

    # Now assign (firstprog, nextprog)
    # Sort on the descendants and then mass, but only get the index
    # This is confusing, but really what the sorted index refers to are the
    # progenitors and this is the order in which the progenitors should be
    # assigned as FirstProg, NextProg
    forest['Mvir'] *= -1  # hack so that the sort is decreasing order
    sorted_prog_inds = np.argsort(forest, order=['desc_id', 'Mvir'])
    forest['Mvir'] *= -1  # reset the masses back

    # t0 = time.perf_counter()
    valid_desc_idx = np.where(forest['desc_id'][sorted_prog_inds] != -1)[0]
    uniq_descid, firstprog_inds = \
        np.unique(forest['desc_id'][sorted_prog_inds[valid_desc_idx]],
                  return_index=True)
    desc_for_firstprog_idx = np.searchsorted(haloids, uniq_descid,
                                             sorter=sorted_halo_idx)

    # np.testing.assert_array_equal(firstprog, forest['FirstProgenitor'])
    forest['FirstProgenitor'][sorted_halo_idx[desc_for_firstprog_idx]] = \
        sorted_prog_inds[valid_desc_idx[firstprog_inds]]

    # Check that the firstprog assignment was okay
    xx = np.where(forest['FirstProgenitor'] != -1)[0]
    firstprog = forest['FirstProgenitor'][xx]
    np.testing.assert_array_equal(forest['desc_id'][firstprog],
                                  forest['id'][xx])

    # See https://stackoverflow.com/a/53507580 and
    # https://stackoverflow.com/a/54736464
    valid_prog_loc = np.split(sorted_prog_inds[valid_desc_idx],
                              firstprog_inds[1:])
    for lhs in valid_prog_loc:
        assert forest['desc_id'][lhs].min() == forest['desc_id'][lhs].max()
        desc = forest['Descendant'][lhs[0]]
        assert forest['FirstProgenitor'][desc] == lhs[0]
        mvir = forest['Mvir'][lhs]
        assert np.all(np.diff(mvir) <= 0.0)
        forest['NextProgenitor'][lhs] = [*lhs[1:], -1]
        forest['PrevProgenitor'][lhs] = [-1, *lhs[0:-1]]

    return True