Source code for halotools.mock_observables.two_point_clustering.marked_tpcf

r"""
Module containing the `~halotools.mock_observables.marked_tpcf` function used to
calculate the marked two-point correlation function.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
from astropy.utils.misc import NumpyRNGContext

from .clustering_helpers import process_optional_input_sample2

from ..mock_observables_helpers import (enforce_sample_has_correct_shape,
    get_separation_bins_array, get_period, get_num_threads)
from ..pair_counters.mesh_helpers import _enforce_maximum_search_length

from ..pair_counters import npairs_3d, marked_npairs_3d

from ...custom_exceptions import HalotoolsError


__all__ = ['marked_tpcf']
__author__ = ['Duncan Campbell']

np.seterr(divide='ignore', invalid='ignore')  # ignore divide by zero in e.g. DD/RR



[docs]
def marked_tpcf(sample1, rbins, sample2=None,
        marks1=None, marks2=None, period=None, do_auto=True, do_cross=True,
        num_threads=1, weight_func_id=1,
        normalize_by='random_marks', iterations=1, randomize_marks=None, seed=None):
    r"""
    Calculate the real space marked two-point correlation function, :math:`\mathcal{M}(r)`.

    Example calls to this function appear in the documentation below.
    See the :ref:`mock_obs_pos_formatting` documentation page for
    instructions on how to transform your coordinate position arrays into the
    format accepted by the ``sample1`` and ``sample2`` arguments.

    Parameters
    ----------
    sample1 : array_like
        Npts1 x 3 numpy array containing 3-D positions of points.
        See the :ref:`mock_obs_pos_formatting` documentation page, or the
        Examples section below, for instructions on how to transform
        your coordinate position arrays into the
        format accepted by the ``sample1`` and ``sample2`` arguments.
        Length units are comoving and assumed to be in Mpc/h, here and throughout Halotools.

    rbins : array_like
        array of boundaries defining the real space radial bins in which pairs are counted.
        Length units are comoving and assumed to be in Mpc/h, here and throughout Halotools.

    sample2 : array_like, optional
        Npts2 x 3 array containing 3-D positions of points.
        Passing ``sample2`` as an input permits the calculation of
        the cross-correlation function. Default is None, in which case only the
        auto-correlation function will be calculated.

    marks1 : array_like, optional
        len(sample1) x N_marks array of marks.  The supplied marks array must have the
        appropriate shape for the chosen ``weight_func_id`` (see Notes for requirements).  If this
        parameter is not specified, it is set to numpy.ones((len(sample1), N_marks)).

    marks2 : array_like, optional
        len(sample2) x N_marks array of marks.  The supplied marks array must have the
        appropriate shape for the chosen ``weight_func_id`` (see Notes for requirements).  If this
        parameter is not specified, it is set to numpy.ones((len(sample2), N_marks)).

    period : array_like, optional
        Length-3 sequence defining the periodic boundary conditions
        in each dimension. If you instead provide a single scalar, Lbox,
        period is assumed to be the same in all Cartesian directions.
        If set to None (the default option), PBCs are set to infinity.
        Length units are comoving and assumed to be in Mpc/h, here and throughout Halotools.

    do_auto : boolean, optional
        Boolean determines whether the auto-correlation function will
        be calculated and returned. Default is True.

    do_cross : boolean, optional
        Boolean determines whether the cross-correlation function will
        be calculated and returned. Only relevant when ``sample2`` is also provided.
        Default is True for the case where ``sample2`` is provided, otherwise False.

    num_threads : int, optional
        Number of threads to use in calculation, where parallelization is performed
        using the python ``multiprocessing`` module. Default is 1 for a purely serial
        calculation, in which case a multiprocessing Pool object will
        never be instantiated. A string 'max' may be used to indicate that
        the pair counters should use all available cores on the machine.

    weight_func_id : int, optional
        Integer ID indicating which marking function should be used.
        See notes for a list of available marking functions.

    normalize_by : string, optional
        A string indicating how to normailze the weighted pair counts in the marked
        correlation function calculation.  Options are: 'random_marks' or 'number_counts'.
        See Notes for more detail.

    iterations : int, optional
        integer indicating the number of times to calculate the random weights,
        taking the mean of the outcomes.  Only applicable if ``normalize_by`` is set
        to 'random_marks'.  See Notes for further explanation.

    randomize_marks : array_like, optional
        Boolean array of length N_marks indicating which elements should be randomized
        when calculating the random weighted pair counts.  Default is [True]*N_marks.
        This parameter is only applicable if ``normalize_by`` is set to 'random_marks'.
        See Notes for more detail.

    seed : int, optional
        Random number seed used to shuffle the marks
        and to randomly downsample data, if applicable.
        Default is None, in which case downsampling and shuffling will be stochastic.

    Returns
    -------
    marked_correlation_function(s) : numpy.array
        *len(rbins)-1* length array containing the marked correlation function
        :math:`\mathcal{M}(r)` computed in each of the bins defined by ``rbins``.

        .. math::
            \mathcal{M}(r) \equiv \mathrm{WW}(r) / \mathrm{XX}(r),

        where :math:`\mathrm{WW}(r)` is the weighted number of pairs with separations
        equal to :math:`r`, and :math:`\mathrm{XX}(r)` is dependent on the choice of the
        ``normalize_by`` parameter.  If ``normalize_by`` is 'random_marks'
        :math:`XX \equiv \mathcal{RR}`, the weighted pair counts where the marks have
        been randomized marks.  If ``normalize_by`` is 'number_counts'
        :math:`XX \equiv DD`, the unweighted pair counts.
        See Notes for more detail.

        If ``sample2`` is passed as input, three arrays of length *len(rbins)-1* are
        returned:

        .. math::
            \mathcal{M}_{11}(r), \ \mathcal{M}_{12}(r), \ \mathcal{M}_{22}(r),

        the autocorrelation of ``sample1``, the cross-correlation between ``sample1`` and
        ``sample2``, and the autocorrelation of ``sample2``.  If ``do_auto`` or
        ``do_cross`` is set to False, the appropriate result(s) is not returned.

    Notes
    -----
    Pairs are counted using
    `~halotools.mock_observables.pair_counters.marked_npairs_3d`.

    If the ``period`` argument is passed in, the ith coordinate of all points
    must be between 0 and period[i].

    ``normalize_by`` indicates how to calculate :math:`\mathrm{XX}`.  If ``normalize_by``
    is 'random_marks', then :math:`\mathrm{XX} \equiv \mathcal{RR}`, and
    :math:`\mathcal{RR}` is calculated by randomizing the marks among points according
    to the ``randomize_marks`` mask.  This marked correlation function is then:

    .. math::
        \mathcal{M}(r) \equiv \frac{\sum_{ij}f(m_i,m_j)}{\sum_{kl}f(m_k,m_l)}

    where the sum in the numerator is of pairs :math:`i,j` with separation :math:`r`,
    and marks :math:`m_i,m_j`.  :math:`f()` is the marking function, ``weight_func_id``.  The sum
    in the denominator is over an equal number of random pairs :math:`k,l`. The
    calculation of this sum can be done multiple times, by setting the ``iterations``
    parameter. The mean of the sum is then taken amongst iterations and used in the
    calculation.

    If ``normalize_by`` is 'number_counts', then :math:`\mathrm{XX} \equiv \mathrm{DD}`
    is calculated by counting total number of pairs using
    `~halotools.mock_observables.pair_counters.npairs_3d`.
    This is:

    .. math::
        \mathcal{M}(r) \equiv \frac{\sum_{ij}f(m_i,m_j)}{\sum_{ij} 1},

    There are multiple marking functions available.  In general, each requires a different
    number of marks per point, N_marks.  The marking function gets passed two vectors
    per pair, w1 and w2, of length N_marks and return a float.  The available marking
    functions, ``weight_func_id`` and the associated integer ID numbers are:

    #. multiplicaitive weights (N_marks = 1)
        .. math::
            f(w_1,w_2) = w_1[0] \times w_2[0]

    #. summed weights (N_marks = 1)
        .. math::
            f(w_1,w_2) = w_1[0] + w_2[0]

    #. equality weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_1[1]\times w_2[1] & : w_1[0] = w_2[0] \\
                    0.0 & : w_1[0] \neq w_2[0] \\
                \end{array}
                \right.

    #. inequality weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_1[1]\times w_2[1] & : w_1[0] \neq w_2[0] \\
                    0.0 & : w_1[0] = w_2[0] \\
                \end{array}
                \right.

    #. greater than weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_1[1]\times w_2[1] & : w_2[0] > w_1[0] \\
                    0.0 & : w_2[0] \leq w_1[0] \\
                \end{array}
                \right.

    #. less than weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_1[1]\times w_2[1] & : w_2[0] < w_1[0] \\
                    0.0 & : w_2[0] \geq w_1[0] \\
                \end{array}
                \right.

    #. greater than tolerance weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_2[1] & : w_2[0]>(w_1[0]+w_1[1]) \\
                    0.0 & : w_2[0] \leq (w_1[0]+w_1[1]) \\
                \end{array}
                \right.

    #. less than tolerance weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_2[1] & : w_2[0]<(w_1[0]+w_1[1]) \\
                    0.0 & : w_2[0] \geq (w_1[0]+w_1[1]) \\
                \end{array}
                \right.

    #. tolerance weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_2[1] & : |w_1[0]-w_2[0]|<w_1[1] \\
                    0.0 & : |w_1[0]-w_2[0]| \geq w_1[1] \\
                \end{array}
                \right.

    #. exclusion weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_2[1] & : |w_1[0]-w_2[0]|>w_1[1] \\
                    0.0 & : |w_1[0]-w_2[0]| \leq w_1[1] \\
                \end{array}
                \right.

    #. ratio weights (N_marks = 2)
        .. math::
            f(w_1,w_2) =
                \left \{
                \begin{array}{ll}
                    w_2[1] & : w2[0] > w1[0]*w1[1] \\
                    0.0 & : otherwise \\
                \end{array}
                \right.

    Examples
    --------
    For demonstration purposes we create a randomly distributed set of points within a
    periodic unit cube.

    >>> Npts = 1000
    >>> Lbox = 1.0
    >>> period = np.array([Lbox,Lbox,Lbox])

    >>> x = np.random.random(Npts)
    >>> y = np.random.random(Npts)
    >>> z = np.random.random(Npts)

    We transform our *x, y, z* points into the array shape used by the function by
    taking the transpose of the result of `numpy.vstack`. This boilerplate transformation
    is used throughout the `~halotools.mock_observables` sub-package:

    >>> coords = np.vstack((x,y,z)).T

    Assign random floats in the range [0,1] to the points to use as the marks:

    >>> marks = np.random.random(Npts)

    Use the multiplicative marking function:

    >>> rbins = np.logspace(-2,-1,10)
    >>> MCF = marked_tpcf(coords, rbins, marks1=marks, period=period, normalize_by='number_counts', weight_func_id=1)

    The result should be consistent with :math:`\langle {\rm mark}\rangle^2` at all *r*
    within the statistical errors.
    """

    # process parameters
    function_args = (sample1, rbins, sample2, marks1, marks2,
        period, do_auto, do_cross, num_threads,
        weight_func_id, normalize_by, iterations, randomize_marks, seed)
    sample1, rbins, sample2, marks1, marks2, period, do_auto, do_cross, num_threads,\
        weight_func_id, normalize_by, _sample1_is_sample2, PBCs,\
        randomize_marks = _marked_tpcf_process_args(*function_args)

    # calculate marked pairs
    W1W1, W1W2, W2W2 = marked_pair_counts(sample1, sample2, rbins, period,
        num_threads, do_auto, do_cross, marks1, marks2, weight_func_id, _sample1_is_sample2)

    if normalize_by == 'number_counts':
        R1R1, R1R2, R2R2 = pair_counts(sample1, sample2, rbins, period,
            num_threads, do_auto, do_cross, _sample1_is_sample2, None, None)
    # calculate randomized marked pairs
    elif normalize_by == 'random_marks':
        if iterations > 1:
            # create storage arrays of the right shape
            R1R1 = np.zeros((iterations, len(rbins)-1))
            R1R2 = np.zeros((iterations, len(rbins)-1))
            R2R2 = np.zeros((iterations, len(rbins)-1))
            for i in range(iterations):
                # get arrays to randomize marks
                with NumpyRNGContext(seed):
                    permutate1 = np.random.permutation(np.arange(0, len(sample1)))
                    permutate2 = np.random.permutation(np.arange(0, len(sample2)))
                R1R1[i, :], R1R2[i, :], R2R2[i, :] = random_counts(
                    sample1, sample2, rbins, period, num_threads,
                    do_auto, do_cross, marks1, marks2, weight_func_id,
                    _sample1_is_sample2, permutate1, permutate2, randomize_marks)

            R1R1 = np.median(R1R1, axis=0)
            R1R2 = np.median(R1R2, axis=0)
            R2R2 = np.median(R2R2, axis=0)
        else:
            # get arrays to randomize marks
            with NumpyRNGContext(seed):
                permutate1 = np.random.permutation(np.arange(0, len(sample1)))
                permutate2 = np.random.permutation(np.arange(0, len(sample2)))
            R1R1, R1R2, R2R2 = random_counts(sample1, sample2, rbins, period,
                num_threads, do_auto, do_cross, marks1, marks2, weight_func_id,
                _sample1_is_sample2, permutate1, permutate2, randomize_marks)

    # return results
    if _sample1_is_sample2:
        M_11 = W1W1/R1R1
        return M_11
    else:
        if (do_auto is True) & (do_cross is True):
            M_11 = W1W1/R1R1
            M_12 = W1W2/R1R2
            M_22 = W2W2/R2R2
            return M_11, M_12, M_22
        elif (do_cross is True):
            M_12 = W1W2/R1R2
            return M_12
        elif (do_auto is True):
            M_11 = W1W1/R1R1
            M_22 = W2W2/R2R2
            return M_11, M_22



def marked_pair_counts(sample1, sample2, rbins, period, num_threads,
        do_auto, do_cross, marks1, marks2, weight_func_id, _sample1_is_sample2):
    """
    Count weighted data pairs.
    """

    if do_auto is True:
        D1D1 = marked_npairs_3d(sample1, sample1, rbins,
            weights1=marks1, weights2=marks1,
            weight_func_id=weight_func_id, period=period, num_threads=num_threads)
        D1D1 = np.diff(D1D1)
    else:
        D1D1 = None
        D2D2 = None

    if _sample1_is_sample2:
        D1D2 = D1D1
        D2D2 = D1D1
    else:
        if do_cross is True:
            D1D2 = marked_npairs_3d(sample1, sample2, rbins,
                weights1=marks1, weights2=marks2, weight_func_id=weight_func_id,
                period=period, num_threads=num_threads)
            D1D2 = np.diff(D1D2)
        else:
            D1D2 = None
        if do_auto is True:
            D2D2 = marked_npairs_3d(sample2, sample2, rbins,
                weights1=marks2, weights2=marks2, weight_func_id=weight_func_id,
                period=period, num_threads=num_threads)
            D2D2 = np.diff(D2D2)
        else:
            D2D2 = None

    return D1D1, D1D2, D2D2


def random_counts(sample1, sample2, rbins, period, num_threads,
        do_auto, do_cross, marks1, marks2, weight_func_id,
        _sample1_is_sample2, permutate1, permutate2, randomize_marks):
    """
    Count random weighted data pairs.
    """

    permuted_marks1 = marks1
    permuted_marks2 = marks2
    for i in range(marks1.shape[1]):
        if randomize_marks[i]:
            permuted_marks1[:, i] = marks1[permutate1, i]
    for i in range(marks2.shape[1]):
        if randomize_marks[i]:
            permuted_marks2[:, i] = marks2[permutate2, i]

    if do_auto is True:
        R1R1 = marked_npairs_3d(sample1, sample1, rbins,
            weights1=marks1, weights2=permuted_marks1,
            weight_func_id=weight_func_id, period=period, num_threads=num_threads)
        R1R1 = np.diff(R1R1)
    else:
        R1R1 = None
        R2R2 = None

    if _sample1_is_sample2:
        R1R2 = R1R1
        R2R2 = R1R1
    else:
        if do_cross is True:
            R1R2 = marked_npairs_3d(sample1, sample2, rbins,
                weights1=permuted_marks1,
                weights2=permuted_marks2,
                weight_func_id=weight_func_id, period=period, num_threads=num_threads)
            R1R2 = np.diff(R1R2)
        else:
            R1R2 = None
        if do_auto is True:
            R2R2 = marked_npairs_3d(sample2, sample2, rbins,
                weights1=marks2, weights2=permuted_marks2,
                weight_func_id=weight_func_id, period=period, num_threads=num_threads)
            R2R2 = np.diff(R2R2)
        else:
            R2R2 = None

    return R1R1, R1R2, R2R2


def pair_counts(sample1, sample2, rbins, period, num_threads, do_auto, do_cross,
        _sample1_is_sample2, approx_cell1_size, approx_cell2_size):
    """
    Count data-data pairs.
    """
    if do_auto is True:
        D1D1 = npairs_3d(sample1, sample1, rbins, period=period, num_threads=num_threads,
                      approx_cell1_size=approx_cell1_size,
                      approx_cell2_size=approx_cell1_size)
        D1D1 = np.diff(D1D1)
    else:
        D1D1 = None
        D2D2 = None

    if _sample1_is_sample2:
        D1D2 = D1D1
        D2D2 = D1D1
    else:
        if do_cross is True:
            D1D2 = npairs_3d(sample1, sample2, rbins, period=period,
                          num_threads=num_threads,
                          approx_cell1_size=approx_cell1_size,
                          approx_cell2_size=approx_cell2_size)
            D1D2 = np.diff(D1D2)
        else:
            D1D2 = None
        if do_auto is True:
            D2D2 = npairs_3d(sample2, sample2, rbins, period=period,
                          num_threads=num_threads,
                          approx_cell1_size=approx_cell2_size,
                          approx_cell2_size=approx_cell2_size)
            D2D2 = np.diff(D2D2)
        else:
            D2D2 = None

    return D1D1, D1D2, D2D2


def _marked_tpcf_process_args(sample1, rbins, sample2, marks1, marks2,
        period, do_auto, do_cross, num_threads,
        wfunc, normalize_by, iterations, randomize_marks, seed):
    """
    Private method to do bounds-checking on the arguments passed to
    `~halotools.mock_observables.marked_tpcf`.
    """

    sample1 = enforce_sample_has_correct_shape(sample1)

    sample2, _sample1_is_sample2, do_cross = process_optional_input_sample2(
        sample1, sample2, do_cross)

    # process wfunc parameter
    try:
        int(wfunc) == wfunc
    except:
        msg = ("\n `wfunc` parameter must be an integer ID of the desired function.")
        raise ValueError(msg)

    # process normalize_by parameter
    if normalize_by not in ['random_marks', 'number_counts']:
        msg = ("\n `normalize_by` parameter not recognized.")
        raise ValueError(msg)

    # process marks
    if marks1 is not None:
        marks1 = np.atleast_1d(marks1).astype(float)
    else:
        marks1 = np.ones(len(sample1)).astype(float)
    if marks2 is not None:
        marks2 = np.atleast_1d(marks2).astype(float)
    else:
        marks2 = np.ones(len(sample2)).astype(float)

    if marks1.ndim == 1:
        npts1 = len(marks1)
        marks1 = marks1.reshape((npts1, 1))
    elif marks1.ndim == 2:
        pass
    else:
        ndim1 = marks1.ndim
        msg = ("\n You must either pass in a 1-D or 2-D array \n"
               "for the input `marks1`. \n"
               "The `pair_counters._wnpairs_process_weights` function received \n"
               "a `marks1` array of dimension %i")
        raise HalotoolsError(msg % ndim1)

    if marks2.ndim == 1:
        npts2 = len(marks2)
        marks2 = marks2.reshape((npts2, 1))
    elif marks2.ndim == 2:
        pass
    else:
        ndim2 = marks2.ndim
        msg = ("\n You must either pass in a 1-D or 2-D array \n"
               "for the input `marks2`. \n"
               "The `pair_counters._wnpairs_process_weights` function received \n"
               "a `marks2` array of dimension %i")
        raise HalotoolsError(msg % ndim2)

    # check for consistency between marks and samples
    if len(marks1) != len(sample1):
        msg = ("\n `marks1` must have same length as `sample1`.")
        raise HalotoolsError(msg)
    if len(marks2) != len(sample2):
        msg = ("\n `marks2` must have same length as `sample2`.")
        raise HalotoolsError(msg)

    if randomize_marks is not None:
        randomize_marks = np.atleast_1d(randomize_marks)
    else:
        randomize_marks = np.array([True]*marks1.shape[1])

    if randomize_marks.ndim == 1:
        if len(randomize_marks) != marks1.shape[1]:
            msg = ("\n `randomize_marks` must have same length \n"
                   " as the number of weights per point.")
            raise HalotoolsError(msg)
    else:
        msg = ("\n `randomize_marks` must be one dimensional.")
        raise HalotoolsError(msg)

    rbins = get_separation_bins_array(rbins)
    rmax = np.amax(rbins)

    period, PBCs = get_period(period)

    _enforce_maximum_search_length(rmax, period)

    try:
        assert do_auto == bool(do_auto)
        assert do_cross == bool(do_cross)
    except:
        msg = "`do_auto` and `do_cross` keywords must be boolean-valued."
        raise ValueError(msg)

    num_threads = get_num_threads(num_threads)

    return sample1, rbins, sample2, marks1, marks2, period, do_auto, do_cross,\
        num_threads, wfunc, normalize_by, _sample1_is_sample2, PBCs, randomize_marks
Navigation

Source code for halotools.mock_observables.two_point_clustering.marked_tpcf