r"""
Modules performing small, commonly used tasks throughout the package.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
from math import ceil
import numpy as np
from warnings import warn
from astropy.table import Table
from ..custom_exceptions import HalotoolsError
__all__ = ['SampleSelector']
def compute_conditional_percentiles(**kwargs):
r"""
In bins of the ``prim_haloprop``, compute the rank-order percentile
of the input ``table`` based on the value of ``sec_haloprop``.
Parameters
----------
table : astropy table, optional
a keyword argument that stores halo catalog being used to make mock galaxy population
If a `table` is passed, the `prim_haloprop_key` and `sec_haloprop_key` keys
must also be passed. If not passing a `table`, you must directly pass the
`prim_haloprop` and `sec_haloprop` keyword arguments.
prim_haloprop_key : string, optional
Name of the column of the input ``table`` that will be used to access the
primary halo property. `compute_conditional_percentiles` bins the ``table`` by
``prim_haloprop_key`` when computing the result.
sec_haloprop_key : string, optional
Name of the column of the input ``table`` that will be used to access the
secondary halo property. `compute_conditional_percentiles` bins the ``table`` by
``prim_haloprop_key``, and in each bin uses the value stored in ``sec_haloprop_key``
to compute the ``prim_haloprop``-conditioned rank-order percentile.
prim_haloprop : array_like, optional
Array storing the primary halo property used to bin the input points.
If a `prim_haloprop` is passed, you must also pass a `sec_haloprop`.
sec_haloprop : array_like, optional
Array storing the secondary halo property used to define the conditional percentiles
in each bin of `prim_haloprop`.
prim_haloprop_bin_boundaries : array, optional
Array defining the boundaries by which we will bin the input ``table``.
Default is None, in which case the binning will be automatically determined using
the ``dlog10_prim_haloprop`` keyword.
dlog10_prim_haloprop : float, optional
Logarithmic spacing of bins of the mass-like variable within which
we will assign secondary property percentiles. Default is 0.2.
Examples
--------
>>> from halotools.sim_manager import FakeSim
>>> fakesim = FakeSim()
>>> result = compute_conditional_percentiles(table = fakesim.halo_table, prim_haloprop_key = 'halo_mvir', sec_haloprop_key = 'halo_vmax')
Notes
-----
The sign of the result is such that in bins of the primary property,
*smaller* values of the secondary property
receive *smaller* values of the returned percentile.
"""
if 'table' in kwargs:
table = kwargs['table']
try:
prim_haloprop_key = kwargs['prim_haloprop_key']
prim_haloprop = table[prim_haloprop_key]
sec_haloprop_key = kwargs['sec_haloprop_key']
sec_haloprop = table[sec_haloprop_key]
except KeyError:
msg = ("\nWhen passing an input ``table`` to the ``compute_conditional_percentiles`` method,\n"
"you must also pass ``prim_haloprop_key`` and ``sec_haloprop_key`` keyword arguments\n"
"whose values are column keys of the input ``table``\n")
raise HalotoolsError(msg)
else:
try:
prim_haloprop = kwargs['prim_haloprop']
sec_haloprop = kwargs['sec_haloprop']
except KeyError:
msg = ("\nIf not passing an input ``table`` to the ``compute_conditional_percentiles`` method,\n"
"you must pass a ``prim_haloprop`` and ``sec_haloprop`` arguments\n")
raise HalotoolsError(msg)
def compute_prim_haloprop_bins(dlog10_prim_haloprop=0.05, **kwargs):
r"""
Parameters
----------
prim_haloprop : array
Array storing the value of the primary halo property column of the ``table``
passed to ``compute_conditional_percentiles``.
prim_haloprop_bin_boundaries : array, optional
Array defining the boundaries by which we will bin the input ``table``.
Default is None, in which case the binning will be automatically determined using
the ``dlog10_prim_haloprop`` keyword.
dlog10_prim_haloprop : float, optional
Logarithmic spacing of bins of the mass-like variable within which
we will assign secondary property percentiles. Default is 0.2.
Returns
--------
output : array
Numpy array of integers storing the bin index of the prim_haloprop bin
to which each halo in the input table was assigned.
"""
try:
prim_haloprop = kwargs['prim_haloprop']
except KeyError:
msg = ("The ``compute_prim_haloprop_bins`` method "
"requires the ``prim_haloprop`` keyword argument")
raise HalotoolsError(msg)
try:
prim_haloprop_bin_boundaries = kwargs['prim_haloprop_bin_boundaries']
except KeyError:
lg10_min_prim_haloprop = np.log10(np.min(prim_haloprop))-0.001
lg10_max_prim_haloprop = np.log10(np.max(prim_haloprop))+0.001
num_prim_haloprop_bins = (lg10_max_prim_haloprop-lg10_min_prim_haloprop)/dlog10_prim_haloprop
prim_haloprop_bin_boundaries = np.logspace(
lg10_min_prim_haloprop, lg10_max_prim_haloprop,
num=int(ceil(num_prim_haloprop_bins)))
# digitize the masses so that we can access them bin-wise
output = np.digitize(prim_haloprop, prim_haloprop_bin_boundaries)
# Use the largest bin for any points larger than the largest bin boundary,
# and raise a warning if such points are found
Nbins = len(prim_haloprop_bin_boundaries)
if Nbins in output:
msg = ("\n\nThe ``compute_prim_haloprop_bins`` function detected points in the \n"
"input array of primary halo property that were larger than the largest value\n"
"of the input ``prim_haloprop_bin_boundaries``. All such points will be assigned\n"
"to the largest bin.\nBe sure that this is the behavior you expect for your application.\n\n")
warn(msg)
output = np.where(output == Nbins, Nbins-1, output)
return output
compute_prim_haloprop_bins_dict = {}
compute_prim_haloprop_bins_dict['prim_haloprop'] = prim_haloprop
try:
compute_prim_haloprop_bins_dict['prim_haloprop_bin_boundaries'] = (
kwargs['prim_haloprop_bin_boundaries'])
except KeyError:
pass
try:
compute_prim_haloprop_bins_dict['dlog10_prim_haloprop'] = kwargs['dlog10_prim_haloprop']
except KeyError:
pass
prim_haloprop_bins = compute_prim_haloprop_bins(**compute_prim_haloprop_bins_dict)
output = np.zeros_like(prim_haloprop)
# sort on secondary property only with each mass bin
bins_in_halocat = set(prim_haloprop_bins)
for ibin in bins_in_halocat:
indices_of_prim_haloprop_bin = np.where(prim_haloprop_bins == ibin)[0]
num_in_bin = len(sec_haloprop[indices_of_prim_haloprop_bin])
# Find the indices that sort by the secondary property
ind_sorted = np.argsort(sec_haloprop[indices_of_prim_haloprop_bin])
percentiles = np.zeros(num_in_bin)
percentiles[ind_sorted] = (np.arange(num_in_bin) + 1.0) / float(num_in_bin)
# place the percentiles into the catalog
output[indices_of_prim_haloprop_bin] = percentiles
return output
[docs]
class SampleSelector(object):
r""" Container class for commonly used sample selections.
"""
[docs]
@staticmethod
def host_halo_selection(return_subhalos=False, **kwargs):
r""" Method divides sample in to host halos and subhalos, and returns
either the hosts or the hosts and the subs depending
on the value of the input ``return_subhalos``.
"""
table = kwargs['table']
mask = table['halo_upid'] == -1
if return_subhalos is False:
return table[mask]
else:
return table[mask], table[~mask]
[docs]
@staticmethod
def property_range(lower_bound=-float("inf"), upper_bound=float("inf"),
return_complement=False, host_halos_only=False, subhalos_only=False, **kwargs):
r""" Method makes a cut on an input table column based on an input upper and lower bound, and
returns the cut table.
Parameters
----------
table : Astropy Table object, keyword argument
key : string, keyword argument
Column name that will be used to apply the cut
lower_bound : float, optional keyword argument
Minimum value for the input column of the returned table. Default is :math:`-\infty`.
upper_bound : float, optional keyword argument
Maximum value for the input column of the returned table. Default is :math:`+\infty`.
return_complement : bool, optional keyword argument
If True, `property_range` gives the table elements that do not pass the cut
as the second return argument. Default is False.
host_halos_only : bool, optional keyword argument
If true, `property_range` will use the `host_halo_selection` method to
make an additional cut on the sample so that only host halos are returned.
Default is False
subhalos_only : bool, optional keyword argument
If true, `property_range` will use the `host_halo_selection` method to
make an additional cut on the sample so that only subhalos are returned.
Default is False
Returns
-------
cut_table : Astropy Table object
Examples
---------
To demonstrate the `property_range` method, we will start out by loading
a table of halos into memory using the `FakeSim` class:
>>> from halotools.sim_manager import FakeSim
>>> halocat = FakeSim()
>>> halos = halocat.halo_table
To make a cut on the halo catalog to select halos in a specific mass range:
>>> halo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13)
To apply this same cut, and also only select host halos passing the cut, we use the ``host_halos_only`` keyword:
>>> host_halo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13, host_halos_only=True)
The same applies if we only want subhalos returned only now we use the ``subhalos_only`` keyword:
>>> subhalo_sample = SampleSelector.property_range(table = halos, key = 'halo_mvir', lower_bound = 1e12, upper_bound = 1e13, subhalos_only=True)
"""
table = kwargs['table']
# First apply the host halo cut, if applicable
if (host_halos_only is True) & (subhalos_only is True):
raise KeyError("You cannot simultaneously select only host halos and only subhalos")
elif host_halos_only is True:
table = SampleSelector.host_halo_selection(table=table)
elif subhalos_only is True:
hosts, table = SampleSelector.host_halo_selection(table=table, return_subhalos=True)
key = kwargs['key']
mask = (table[key] >= lower_bound) & (table[key] <= upper_bound)
if return_complement is True:
return table[mask], table[np.invert(mask)]
else:
return table[mask]
[docs]
@staticmethod
def split_sample(**kwargs):
r""" Method divides a sample into subsamples based on the percentile ranking of a given property.
Parameters
----------
table : Astropy Table object, keyword argument
key : string, keyword argument
Column name that will be used to define the percentiles
percentiles : array_like
Sequence of percentiles used to define the returned subsamples. If ``percentiles``
has more than one element, the elements must be monotonically increasing.
If ``percentiles`` is length-N, there will be N+1 returned subsamples.
Returns
-------
subsamples : list
Examples
--------
To demonstrate the `split_sample` method, we will start out by loading
a table of halos into memory using the `FakeSim` class:
>>> from halotools.sim_manager import FakeSim
>>> halocat = FakeSim()
>>> halos = halocat.halo_table
We can easily use `split_sample` to divide the sample into a high-Vmax and low-Vmax subsamples:
>>> sample_below_median, sample_above_median = SampleSelector.split_sample(table = halos, key = 'halo_vmax', percentiles = 0.5)
Likewise, we can do the same thing to divide the sample into quartiles:
>>> lowest, lower, higher, highest = SampleSelector.split_sample(table = halos, key = 'halo_zhalf', percentiles = [0.25, 0.5, 0.75])
The following alternative syntax is also supported:
>>> subsample_collection = SampleSelector.split_sample(table = halos, key = 'halo_zhalf', percentiles = [0.25, 0.5, 0.75])
>>> lowest, lower, higher, highest = subsample_collection
"""
table = kwargs['table']
if not isinstance(table, Table):
raise TypeError("Input table must be an Astropy Table instance")
key = kwargs['key']
if key not in list(table.keys()):
raise KeyError("The ``{0}`` key does not appear in the table you are trying \n"
"to split into subsamples".format(key))
table.sort(key)
percentiles = kwargs['percentiles']
percentiles = np.array(percentiles)
if np.shape(percentiles) == ():
percentiles = np.array([percentiles])
num_total = len(table)
if len(percentiles) >= num_total:
raise ValueError("Input length of percentiles must be less than input table length")
indices = percentiles*num_total
indices = np.insert(indices, 0, 0)
percentiles = np.insert(percentiles, 0, 0)
indices = indices.astype(int)
indices = np.append(indices, len(table))
percentiles = np.append(percentiles, 1.0)
d = np.diff(indices)
d[-1] -= 1
if 0 in d:
print("Raise exception: too many percentile bins")
idx_too_few = np.nanargmin(d)
raise ValueError("The input percentiles spacing is too fine.\n"
"For example, there are no table elements in the percentile range (%.2f, %.2f)" %
(percentiles[idx_too_few], percentiles[idx_too_few+1]))
result = np.zeros(len(indices)-1, dtype=object)
for i, first_idx, last_idx in zip(list(range(len(result))), indices[:-1], indices[1:]):
result[i] = table[first_idx:last_idx]
return result