Source code for halotools.utils.table_utils

r"""
Modules performing small, commonly used tasks throughout the package.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

from math import ceil
import numpy as np
from warnings import warn
from astropy.table import Table

from ..custom_exceptions import HalotoolsError

__all__ = ['SampleSelector']


def compute_conditional_percentiles(**kwargs):
    r"""
    In bins of the ``prim_haloprop``, compute the rank-order percentile
    of the input ``table`` based on the value of ``sec_haloprop``.

    Parameters
    ----------
    table : astropy table, optional
        a keyword argument that stores halo catalog being used to make mock galaxy population
        If a `table` is passed, the `prim_haloprop_key` and `sec_haloprop_key` keys
        must also be passed. If not passing a `table`, you must directly pass the
        `prim_haloprop` and `sec_haloprop` keyword arguments.

    prim_haloprop_key : string, optional
        Name of the column of the input ``table`` that will be used to access the
        primary halo property. `compute_conditional_percentiles` bins the ``table`` by
        ``prim_haloprop_key`` when computing the result.

    sec_haloprop_key : string, optional
        Name of the column of the input ``table`` that will be used to access the
        secondary halo property. `compute_conditional_percentiles` bins the ``table`` by
        ``prim_haloprop_key``, and in each bin uses the value stored in ``sec_haloprop_key``
        to compute the ``prim_haloprop``-conditioned rank-order percentile.

    prim_haloprop : array_like, optional
        Array storing the primary halo property used to bin the input points.
        If a `prim_haloprop` is passed, you must also pass a `sec_haloprop`.

    sec_haloprop : array_like, optional
        Array storing the secondary halo property used to define the conditional percentiles
        in each bin of `prim_haloprop`.

    prim_haloprop_bin_boundaries : array, optional
        Array defining the boundaries by which we will bin the input ``table``.
        Default is None, in which case the binning will be automatically determined using
        the ``dlog10_prim_haloprop`` keyword.

    dlog10_prim_haloprop : float, optional
        Logarithmic spacing of bins of the mass-like variable within which
        we will assign secondary property percentiles. Default is 0.2.

    Examples
    --------
    >>> from halotools.sim_manager import FakeSim
    >>> fakesim = FakeSim()
    >>> result = compute_conditional_percentiles(table = fakesim.halo_table, prim_haloprop_key = 'halo_mvir', sec_haloprop_key = 'halo_vmax')


    Notes
    -----
    The sign of the result is such that in bins of the primary property,
    *smaller* values of the secondary property
    receive *smaller* values of the returned percentile.

    """

    if 'table' in kwargs:
        table = kwargs['table']
        try:
            prim_haloprop_key = kwargs['prim_haloprop_key']
            prim_haloprop = table[prim_haloprop_key]
            sec_haloprop_key = kwargs['sec_haloprop_key']
            sec_haloprop = table[sec_haloprop_key]
        except KeyError:
            msg = ("\nWhen passing an input ``table`` to the ``compute_conditional_percentiles`` method,\n"
                "you must also pass ``prim_haloprop_key`` and ``sec_haloprop_key`` keyword arguments\n"
                "whose values are column keys of the input ``table``\n")
            raise HalotoolsError(msg)
    else:
        try:
            prim_haloprop = kwargs['prim_haloprop']
            sec_haloprop = kwargs['sec_haloprop']
        except KeyError:
            msg = ("\nIf not passing an input ``table`` to the ``compute_conditional_percentiles`` method,\n"
                "you must pass a ``prim_haloprop`` and ``sec_haloprop`` arguments\n")
            raise HalotoolsError(msg)

    def compute_prim_haloprop_bins(dlog10_prim_haloprop=0.05, **kwargs):
        r"""
        Parameters
        ----------
        prim_haloprop : array
            Array storing the value of the primary halo property column of the ``table``
            passed to ``compute_conditional_percentiles``.

        prim_haloprop_bin_boundaries : array, optional
            Array defining the boundaries by which we will bin the input ``table``.
            Default is None, in which case the binning will be automatically determined using
            the ``dlog10_prim_haloprop`` keyword.

        dlog10_prim_haloprop : float, optional
            Logarithmic spacing of bins of the mass-like variable within which
            we will assign secondary property percentiles. Default is 0.2.

        Returns
        --------
        output : array
            Numpy array of integers storing the bin index of the prim_haloprop bin
            to which each halo in the input table was assigned.

        """
        try:
            prim_haloprop = kwargs['prim_haloprop']
        except KeyError:
            msg = ("The ``compute_prim_haloprop_bins`` method "
                "requires the ``prim_haloprop`` keyword argument")
            raise HalotoolsError(msg)

        try:
            prim_haloprop_bin_boundaries = kwargs['prim_haloprop_bin_boundaries']
        except KeyError:
            lg10_min_prim_haloprop = np.log10(np.min(prim_haloprop))-0.001
            lg10_max_prim_haloprop = np.log10(np.max(prim_haloprop))+0.001
            num_prim_haloprop_bins = (lg10_max_prim_haloprop-lg10_min_prim_haloprop)/dlog10_prim_haloprop
            prim_haloprop_bin_boundaries = np.logspace(
                lg10_min_prim_haloprop, lg10_max_prim_haloprop,
                num=int(ceil(num_prim_haloprop_bins)))

        # digitize the masses so that we can access them bin-wise
        output = np.digitize(prim_haloprop, prim_haloprop_bin_boundaries)

        # Use the largest bin for any points larger than the largest bin boundary,
        # and raise a warning if such points are found
        Nbins = len(prim_haloprop_bin_boundaries)
        if Nbins in output:
            msg = ("\n\nThe ``compute_prim_haloprop_bins`` function detected points in the \n"
                "input array of primary halo property that were larger than the largest value\n"
                "of the input ``prim_haloprop_bin_boundaries``. All such points will be assigned\n"
                "to the largest bin.\nBe sure that this is the behavior you expect for your application.\n\n")
            warn(msg)
            output = np.where(output == Nbins, Nbins-1, output)

        return output

    compute_prim_haloprop_bins_dict = {}
    compute_prim_haloprop_bins_dict['prim_haloprop'] = prim_haloprop
    try:
        compute_prim_haloprop_bins_dict['prim_haloprop_bin_boundaries'] = (
            kwargs['prim_haloprop_bin_boundaries'])
    except KeyError:
        pass
    try:
        compute_prim_haloprop_bins_dict['dlog10_prim_haloprop'] = kwargs['dlog10_prim_haloprop']
    except KeyError:
        pass
    prim_haloprop_bins = compute_prim_haloprop_bins(**compute_prim_haloprop_bins_dict)

    output = np.zeros_like(prim_haloprop)

    # sort on secondary property only with each mass bin
    bins_in_halocat = set(prim_haloprop_bins)
    for ibin in bins_in_halocat:
        indices_of_prim_haloprop_bin = np.where(prim_haloprop_bins == ibin)[0]

        num_in_bin = len(sec_haloprop[indices_of_prim_haloprop_bin])

        # Find the indices that sort by the secondary property
        ind_sorted = np.argsort(sec_haloprop[indices_of_prim_haloprop_bin])

        percentiles = np.zeros(num_in_bin)
        percentiles[ind_sorted] = (np.arange(num_in_bin) + 1.0) / float(num_in_bin)

        # place the percentiles into the catalog
        output[indices_of_prim_haloprop_bin] = percentiles

    return output



[docs]
class SampleSelector(object):
    r""" Container class for commonly used sample selections.
    """


[docs]
    @staticmethod
    def host_halo_selection(return_subhalos=False, **kwargs):
        r""" Method divides sample in to host halos and subhalos, and returns
        either the hosts or the hosts and the subs depending
        on the value of the input ``return_subhalos``.
        """
        table = kwargs['table']
        mask = table['halo_upid'] == -1
        if return_subhalos is False:
            return table[mask]
        else:
            return table[mask], table[~mask]



[docs]
    @staticmethod
    def property_range(lower_bound=-float("inf"), upper_bound=float("inf"),
            return_complement=False, host_halos_only=False, subhalos_only=False, **kwargs):
        r""" Method makes a cut on an input table column based on an input upper and lower bound, and
        returns the cut table.

        Parameters
        ----------
        table : Astropy Table object, keyword argument

        key : string, keyword argument
            Column name that will be used to apply the cut

        lower_bound : float, optional keyword argument
            Minimum value for the input column of the returned table. Default is :math:`-\infty`.

        upper_bound : float, optional keyword argument
            Maximum value for the input column of the returned table. Default is :math:`+\infty`.

        return_complement : bool, optional keyword argument
            If True, `property_range` gives the table elements that do not pass the cut
            as the second return argument. Default is False.

        host_halos_only : bool, optional keyword argument
            If true, `property_range` will use the `host_halo_selection` method to
            make an additional cut on the sample so that only host halos are returned.
            Default is False

        subhalos_only : bool, optional keyword argument
            If true, `property_range` will use the `host_halo_selection` method to
            make an additional cut on the sample so that only subhalos are returned.
            Default is False


        Returns
        -------
        cut_table : Astropy Table object

        Examples
        ---------
        To demonstrate the `property_range` method, we will start out by loading
        a table of halos into memory using the `FakeSim` class:

        >>> from halotools.sim_manager import FakeSim
        >>> halocat = FakeSim()
        >>> halos = halocat.halo_table

        To make a cut on the halo catalog to select halos in a specific mass range:

        >>> halo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13)

        To apply this same cut, and also only select host halos passing the cut, we use the ``host_halos_only`` keyword:

        >>> host_halo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13, host_halos_only=True)

        The same applies if we only want subhalos returned only now we use the ``subhalos_only`` keyword:

        >>> subhalo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13, subhalos_only=True)

        """
        table = kwargs['table']

        # First apply the host halo cut, if applicable
        if (host_halos_only is True) & (subhalos_only is True):
            raise KeyError("You cannot simultaneously select only host halos and only subhalos")
        elif host_halos_only is True:
            table = SampleSelector.host_halo_selection(table=table)
        elif subhalos_only is True:
            hosts, table = SampleSelector.host_halo_selection(table=table, return_subhalos=True)

        key = kwargs['key']
        mask = (table[key] >= lower_bound) & (table[key] <= upper_bound)

        if return_complement is True:
            return table[mask], table[np.invert(mask)]
        else:
            return table[mask]



[docs]
    @staticmethod
    def split_sample(**kwargs):
        r""" Method divides a sample into subsamples based on the percentile ranking of a given property.

        Parameters
        ----------
        table : Astropy Table object, keyword argument

        key : string, keyword argument
            Column name that will be used to define the percentiles

        percentiles : array_like
            Sequence of percentiles used to define the returned subsamples. If ``percentiles``
            has more than one element, the elements must be monotonically increasing.
            If ``percentiles`` is length-N, there will be N+1 returned subsamples.

        Returns
        -------
        subsamples : list

        Examples
        --------
        To demonstrate the `split_sample` method, we will start out by loading
        a table of halos into memory using the `FakeSim` class:

        >>> from halotools.sim_manager import FakeSim
        >>> halocat = FakeSim()
        >>> halos = halocat.halo_table

        We can easily use `split_sample` to divide the sample into a high-Vmax and low-Vmax subsamples:

        >>> sample_below_median, sample_above_median = SampleSelector.split_sample(table = halos, key = 'halo_vmax', percentiles = 0.5)

        Likewise, we can do the same thing to divide the sample into quartiles:

        >>> lowest, lower, higher, highest = SampleSelector.split_sample(table = halos, key = 'halo_zhalf', percentiles = [0.25, 0.5, 0.75])

        The following alternative syntax is also supported:

        >>> subsample_collection = SampleSelector.split_sample(table = halos, key = 'halo_zhalf', percentiles = [0.25, 0.5, 0.75])
        >>> lowest, lower, higher, highest = subsample_collection

        """
        table = kwargs['table']
        if not isinstance(table, Table):
            raise TypeError("Input table must be an Astropy Table instance")

        key = kwargs['key']
        if key not in list(table.keys()):
            raise KeyError("The ``{0}`` key does not appear in the table you are trying \n"
                "to split into subsamples".format(key))
        table.sort(key)

        percentiles = kwargs['percentiles']
        percentiles = np.array(percentiles)
        if np.shape(percentiles) == ():
            percentiles = np.array([percentiles])
        num_total = len(table)
        if len(percentiles) >= num_total:
            raise ValueError("Input length of percentiles must be less than input table length")

        indices = percentiles*num_total
        indices = np.insert(indices, 0, 0)
        percentiles = np.insert(percentiles, 0, 0)
        indices = indices.astype(int)
        indices = np.append(indices, len(table))
        percentiles = np.append(percentiles, 1.0)

        d = np.diff(indices)
        d[-1] -= 1
        if 0 in d:
            print("Raise exception: too many percentile bins")
            idx_too_few = np.nanargmin(d)
            raise ValueError("The input percentiles spacing is too fine.\n"
                "For example, there are no table elements in the percentile range (%.2f, %.2f)" %
                  (percentiles[idx_too_few], percentiles[idx_too_few+1]))

        result = np.zeros(len(indices)-1, dtype=object)
        for i, first_idx, last_idx in zip(list(range(len(result))), indices[:-1], indices[1:]):
            result[i] = table[first_idx:last_idx]

        return result
Navigation

Source code for halotools.utils.table_utils