Source code for halotools.utils.distribution_matching

""" Module storing distribution_matching_indices used to generate
an indexing array that matches one distribution against another.
"""
import numpy as np
from astropy.utils.misc import NumpyRNGContext


__all__ = ('distribution_matching_indices', 'resample_x_to_match_y',
        'bijective_distribution_matching')



[docs]
def distribution_matching_indices(input_distribution, output_distribution,
        nselect, bins, seed=None):
    """ Calcuate a set of indices that will resample (with replacement)
    ``input_distribution`` so that it matches ``output_distribution``.

    This function is useful, for example, for comparing a pair of samples
    with matching stellar mass functions.

    Parameters
    ----------
    input_distribution : ndarray
        Numpy array of shape (npts1, ) storing the distribution that requires modification

    output_distribution : ndarray
        Numpy array of shape (npts2, ) defining the desired output distribution

    nselect : int
        Number of points to select from ``input_distribution``.

    bins : ndarray
        Binning used to estimate the PDFs. Default is 100 bins automatically
        determined by `numpy.histogram`.

    seed : int, optional
        Random number seed used to generate indices.
        Default is None for stochastic results.

    Returns
    -------
    indices : ndarray
        Numpy array of shape (nselect, ) storing indices ranging from [0, npts1)
        such that ``input_distribution[indices]`` will have a PDF that matches the PDF
        of ``output_distribution``.

    Notes
    -----
    Pay careful attention that your bins are appropriate for your two distributions.
    The PDF of the returned result will only match the ``output_distribution`` PDF
    tabulated in the input ``bins``. Depending on the two distributions and your
    choice of bins, may not be possible to construct matching PDFs
    if your sampling is too sparse or your bins are inappropriate.

    Examples
    --------
    >>> npts1, npts2 = int(1e5), int(1e4)
    >>> input_distribution = np.random.normal(loc=0, scale=1, size=npts1)
    >>> output_distribution = np.random.normal(loc=.5, scale=0.5, size=npts2)
    >>> nselect = int(2e4)
    >>> bins = np.linspace(-2, 2, 50)
    >>> indices = distribution_matching_indices(input_distribution, output_distribution, nselect, bins)

    .. image:: /_static/matched_distributions.png

    """
    hist2, bins = np.histogram(output_distribution, density=True, bins=bins)
    hist1 = np.histogram(input_distribution, bins=bins, density=True)[0].astype(float)

    hist_ratio = np.zeros_like(hist2, dtype=float)
    hist_ratio[hist1 > 0] = hist2[hist1 > 0]/hist1[hist1 > 0]

    bin_mids = 0.5*(bins[:-1] + bins[1:])
    hist_ratio_interp = np.interp(input_distribution, bin_mids, hist_ratio)
    prob_select = hist_ratio_interp/float(hist_ratio_interp.sum())

    candidate_indices = np.arange(len(input_distribution))
    with NumpyRNGContext(seed):
        indices = np.random.choice(candidate_indices, size=nselect, replace=True, p=prob_select)
    return indices




[docs]
def resample_x_to_match_y(x, y, bins, seed=None):
    """ Return the indices that resample `x` (with replacement) so that the
    resampled distribution matches the histogram of `y`.
    The returned indexing array will be sorted so that
    the i^th element of x[idx] is as close as possible to the
    i^th value of x, subject to the the constraint that x[idx] matches y.

    Parameters
    ----------
    x : ndarray
        Numpy array of shape (nx, )

    y : ndarray
        Numpy array of shape (ny, )

    bins : ndarray
        Numpy array of shape (nbins, ) defining how the distribution `y`
        will be binned to evaluate its PDF.

    seed : int, optional
        Random number seed used to generate indices.
        Default is None for stochastic results.

    Returns
    -------
    indices : ndarray
        Numpy array of shape (nx, )

    Examples
    --------
    >>> nx, ny = int(1e5), int(1e4)
    >>> x = np.random.normal(loc=0, size=nx, scale=1)
    >>> y = np.random.normal(loc=1, size=ny, scale=0.5)
    >>> bins = np.linspace(-5, 5, 100)
    >>> indices = resample_x_to_match_y(x, y, bins)
    >>> rescaled_x = x[indices]
    """
    nselect = len(x)
    idx = distribution_matching_indices(x, y, nselect, bins, seed=seed)
    xnew = x[idx]
    idx_sorted_xnew = np.argsort(xnew)
    idx_sorted_x = np.argsort(x)
    indices = np.empty_like(x).astype(int)
    indices[idx_sorted_x] = idx[idx_sorted_xnew]
    return indices




[docs]
def bijective_distribution_matching(x_in, x_desired):
    """ Replace the values in ``x_in`` with ``x_desired``, preserving the rank-order of ``x_in``

    Parameters
    ----------
    x_in : ndarray
        Numpy array of shape (npts, )

    x_desired : ndarray
        Numpy array of shape (npts, )

    Returns
    -------
    x_out : ndarray
        Numpy array of shape (npts, )

    Examples
    --------
    >>> npts = int(1e5)
    >>> x_in = np.random.normal(loc=0, scale=0.5, size=npts)
    >>> x_desired = np.random.normal(loc=2, scale=1, size=npts)
    >>> x_out = bijective_distribution_matching(x_in, x_desired)

    In the figure below, the left hand panel shows that the output distribution
    is in exact agreement with the desired distribution. The right hand panel
    shows that the rank-order of the input distribution is preserved.

    .. image:: /_static/bijective_distribution_matching_demo.png
    """
    x_in = np.atleast_1d(x_in)
    x_desired = np.atleast_1d(x_desired)
    x_out = np.zeros_like(x_in)
    idx_sorted = np.argsort(x_in)
    x_out[idx_sorted] = np.sort(x_desired)
    return x_out
Navigation

Source code for halotools.utils.distribution_matching