Source code for halotools.utils.crossmatch

r""" Module containing the `~halotools.utils.crossmatch` function used to
calculate indices providing the correspondence between two data tables
sharing a common objectID.
"""
import numpy as np


__all__ = ("crossmatch", "compute_richness")



[docs]
def crossmatch(x, y, skip_bounds_checking=False):
    r"""
    Finds where the elements of ``x`` appear in the array ``y``, including repeats.

    The elements in x may be repeated, but the elements in y must be unique.
    The arrays x and y may be only partially overlapping.

    The applications of this function envolve cross-matching
    two catalogs/data tables which share an objectID.
    For example, if you have a primary data table and a secondary data table containing
    supplementary information about (some of) the objects, the
    `~halotools.utils.crossmatch` function can be used to "value-add" the
    primary table with data from the second.

    For another example, suppose you have a single data table
    with an object ID column and also a column for a "host" ID column
    (e.g., ``halo_hostid`` in Halotools-provided catalogs),
    you can use the `~halotools.utils.crossmatch` function to create new columns
    storing properties of the associated host.

    See :ref:`crossmatching_halo_catalogs` and :ref:`crossmatching_galaxy_catalogs`
    for tutorials on common usages of this function with halo and galaxy catalogs.

    Parameters
    ----------
    x : integer array
        Array of integers with possibly repeated entries.

    y : integer array
        Array of unique integers.

    skip_bounds_checking : bool, optional
        The first step in the `crossmatch` function is to test that the input
        arrays satisfy the assumptions of the algorithm
        (namely that ``x`` and ``y`` store integers,
        and that all values in ``y`` are unique).
        If ``skip_bounds_checking`` is set to True,
        this testing is bypassed and the function evaluates faster.
        Default is False.

    Returns
    -------
    idx_x : integer array
        Integer array used to apply a mask to x
        such that x[idx_x] == y[idx_y]

    y_idx : integer array
        Integer array used to apply a mask to y
        such that x[idx_x] == y[idx_y]

    Notes
    -----
    The matching between ``x`` and ``y`` is done on the sorted arrays.  A consequence of
    this is that x[idx_x] and y[idx_y] will generally be a subset of ``x`` and ``y`` in
    sorted order.

    Examples
    --------
    Let's create some fake data to demonstrate basic usage of the function.
    First, let's suppose we have two tables of objects, ``table1`` and ``table2``.
    There are no repeated elements in any table, but these tables only partially overlap.
    The example below demonstrates how to transfer column data from ``table2``
    into ``table1`` for the subset of objects that appear in both tables.

    >>> num_table1 = int(1e6)
    >>> x = np.random.rand(num_table1)
    >>> objid = np.arange(num_table1)
    >>> from astropy.table import Table
    >>> table1 = Table({'x': x, 'objid': objid})

    >>> num_table2 = int(1e6)
    >>> objid = np.arange(5e5, num_table2+5e5)
    >>> y = np.random.rand(num_table2)
    >>> table2 = Table({'y': y, 'objid': objid})

    Note that ``table1`` and ``table2`` only partially overlap. In the code below,
    we will initialize a new ``y`` column for ``table1``, and for those rows
    with an ``objid`` that appears in both ``table1`` and ``table2``,
    we'll transfer the values of ``y`` from ``table2`` to ``table1``.

    >>> idx_table1, idx_table2 = crossmatch(table1['objid'].data, table2['objid'].data)
    >>> table1['y'] = np.zeros(len(table1), dtype = table2['y'].dtype)
    >>> table1['y'][idx_table1] = table2['y'][idx_table2]

    Now we'll consider a slightly more complicated example in which there
    are repeated entries in the input array ``x``. Suppose in this case that
    our data ``x`` comes with a natural grouping, for example into those
    galaxies that occupy a common halo. If we have a separate table ``y`` that
    stores attributes of the group, we may wish to broadcast some group property
    such as total group mass amongst all the group members.

    First create some new dummy data to demonstrate this application of
    the `crossmatch` function:

    >>> num_galaxies = int(1e6)
    >>> x = np.random.rand(num_galaxies)
    >>> objid = np.arange(num_galaxies)
    >>> num_groups = int(1e4)
    >>> groupid = np.random.randint(0, num_groups, num_galaxies)
    >>> galaxy_table = Table({'x': x, 'objid': objid, 'groupid': groupid})

    >>> groupmass = np.random.rand(num_groups)
    >>> groupid = np.arange(num_groups)
    >>> group_table = Table({'groupmass': groupmass, 'groupid': groupid})

    Now we use the `crossmatch` to paint the appropriate value of ``groupmass``
    onto each galaxy:

    >>> idx_galaxies, idx_groups = crossmatch(galaxy_table['groupid'].data, group_table['groupid'].data)
    >>> galaxy_table['groupmass'] = np.zeros(len(galaxy_table), dtype = group_table['groupmass'].dtype)
    >>> galaxy_table['groupmass'][idx_galaxies] = group_table['groupmass'][idx_groups]

    See the tutorials for additional demonstrations of alternative
    uses of the `crossmatch` function.

    See also
    ---------
    :ref:`crossmatching_halo_catalogs`

    :ref:`crossmatching_galaxy_catalogs`

    """
    # Ensure inputs are Numpy arrays
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)

    # Require that the inputs meet the assumptions of the algorithm
    if skip_bounds_checking is True:
        pass
    else:
        try:
            assert len(set(y)) == len(y)
            assert np.all(np.array(y).astype(int) == y)
            assert np.shape(y) == (len(y),)
        except:
            msg = "Input array y must be a 1d sequence of unique integers"
            raise ValueError(msg)
        try:
            assert np.all(np.array(x).astype(int) == x)
            assert np.shape(x) == (len(x),)
        except:
            msg = "Input array x must be a 1d sequence of integers"
            raise ValueError(msg)

    # Internally, we will work with sorted arrays, and then undo the sorting at the end
    idx_x_sorted = np.argsort(x)
    idx_y_sorted = np.argsort(y)
    x_sorted = np.copy(x[idx_x_sorted])
    y_sorted = np.copy(y[idx_y_sorted])

    # x may have repeated entries, so find the unique values as well as their multiplicity
    unique_xvals, counts = np.unique(x_sorted, return_counts=True)

    # Determine which of the unique x values has a match in y
    unique_xval_has_match = np.in1d(unique_xvals, y_sorted, assume_unique=True)

    # Create a boolean array with True for each value in x with a match, otherwise False
    idx_x = np.repeat(unique_xval_has_match, counts)

    # For each unique value of x with a match in y, identify the index of the match
    matching_indices_in_y = np.searchsorted(
        y_sorted, unique_xvals[unique_xval_has_match]
    )

    # Repeat each matching index according to the multiplicity in x
    idx_y = np.repeat(matching_indices_in_y, counts[unique_xval_has_match])

    # Undo the original sorting and return the result
    return idx_x_sorted[idx_x], idx_y_sorted[idx_y]




[docs]
def compute_richness(unique_halo_ids, halo_id_of_galaxies):
    r"""For every ID in unique_halo_ids,
    calculate the number of times the ID appears in halo_id_of_galaxies.

    Parameters
    ----------
    unique_halo_ids : ndarray
        Numpy array of shape (num_halos, ) storing unique integers

    halo_id_of_galaxies : ndarray
        Numpy integer array of shape (num_galaxies, ) storing the host ID of each galaxy

    Returns
    -------
    richness : ndarray
        Numpy integer array of shape (num_halos, ) storing richness of each host halo

    Examples
    --------
    >>> num_hosts = 100
    >>> num_sats = int(1e5)
    >>> unique_halo_ids = np.arange(5, num_hosts + 5)
    >>> halo_id_of_galaxies = np.random.randint(0, 5000, num_sats)
    >>> richness = compute_richness(unique_halo_ids, halo_id_of_galaxies)
    """
    unique_halo_ids = np.atleast_1d(unique_halo_ids).astype(int)
    halo_id_of_galaxies = np.atleast_1d(halo_id_of_galaxies).astype(int)
    richness_result = np.zeros_like(unique_halo_ids).astype(int)

    vals, counts = np.unique(halo_id_of_galaxies, return_counts=True)
    idxA, idxB = crossmatch(vals, unique_halo_ids)
    richness_result[idxB] = counts[idxA]
    return richness_result
Navigation

Source code for halotools.utils.crossmatch