Source code for halotools.utils.inverse_transformation_sampling

""" Core functions used as kernels of inverse transformation sampling
"""
import numpy as np
from astropy.utils import NumpyRNGContext
from warnings import warn
import warnings

from .array_utils import unsorting_indices


__all__ = ('monte_carlo_from_cdf_lookup', 'build_cdf_lookup', 'rank_order_percentile')

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")


[docs] def monte_carlo_from_cdf_lookup(x_table, y_table, mc_input='random', num_draws=None, seed=None): """ Randomly draw a set of ``num_draws`` points from any arbitrary input distribution function. The input distribution is specified in terms of its CDF, which is defined by the values of the input ``y_table`` of the CDF evaluated at an input set of control points ``x_table``. The input ``x_table`` and ``y_table`` can be calculated from any input dataset using the function `build_cdf_lookup`. Parameters ---------- x_table : ndarray Numpy array of shape (npts, ) providing the control points at which the input CDF has been tabulated. y_table : ndarray Numpy array of shape (npts, ) providing the values of the random variable associated with each input control point. mc_input : ndarray, optional Input array of shape (num_desired_draws, ) storing values between 0 and 1 specifying the values of the CDF for which associated values of ``y`` are desired. If ``mc_input`` is left unspecified, the ``num_draws`` must be specified; in this case, the CDF will be randomly sampled. num_draws : int, optional Desired number of random draws from the input CDF. ``num_draws`` must be specified if ``mc_input`` is left unspecified. seed : int, optional Random number seed used in the Monte Carlo realization. Returns -------- mc_realization : ndarray Length-num_draws array of random draws from the input CDF. Notes ----- See the `Transformation of Probability tutorial <https://github.com/jbailinua/probability/>`_ for pedagogical derivations associated with inverse transformation sampling. Examples -------- In this first example, we'll start by creating some fake data, ``y``, drawn from a weighted combination of a Gaussian and a power law. We'll think of the fake data ``y`` as if it came from some external dataset that we want to model, treating the fake data as our input distribution. We will use the `build_cdf_lookup` function to build the necessary lookup tables ``x_table`` and ``y_table``, and then use the `monte_carlo_from_cdf_lookup` function to generate a Monte Carlo realization of the data ``y``. >>> npts = int(1e4) >>> y = 0.1*np.random.normal(size=npts) + 0.9*np.random.power(2, size=npts) >>> x_table, y_table = build_cdf_lookup(y) >>> result = monte_carlo_from_cdf_lookup(x_table, y_table, num_draws=5000) The returned array ``result`` is a stochastic Monte Carlo realization of the distribution specified by ``y``. .. image:: /_static/monte_carlo_example.png Now let's consider a second example where, rather than specifying an integer number of purely random Monte Carlo draws, instead we pass in an array of uniform random numbers. >>> uniform_randoms = np.random.rand(npts) >>> result2 = monte_carlo_from_cdf_lookup(x_table, y_table, mc_input=uniform_randoms) This alternative call to `monte_carlo_from_cdf_lookup` provides completely equivalent results as the first, because we have passed in uniform randoms. Since ``uniform_randoms`` just stores random values, then random values from the input distribution will be returned. However, this alternative form of input can be exploited for other applications. Suppose that instead of purely random draws, you wish to draw from the input distribution ``y`` in such a way that you introduce a correlation between the drawn values and the values stored in some other array, ``x``. You can accomplish this by passing in the rank-order percentile values of ``x`` instead of uniform randoms. This is the basis of the conditional abundance matching technique implemented by the `~halotools.empirical_models.conditional_abunmatch_bin_based` function. To see an example of how this works, let's create some fake data for some property *x* that we wish to model as being correlated with Monte Carlo realizations of *y* while preserving the PDF of the realization: >>> x = np.sort(10**np.random.normal(loc=10, size=5000)) >>> x_percentile = (1. + np.arange(len(x)))/float(len(x)+1) >>> correlated_result = monte_carlo_from_cdf_lookup(x_table, y_table, mc_input=x_percentile) >>> uniform_randoms = np.random.rand(npts) >>> uncorrelated_result = monte_carlo_from_cdf_lookup(x_table, y_table, mc_input=uniform_randoms) We can use the `~halotools.empirical_models.noisy_percentile` to introduce variable levels of noise in the correlation. >>> from halotools.empirical_models import noisy_percentile >>> noisy_x_percentile = noisy_percentile(x_percentile, correlation_coeff=0.75) >>> weakly_correlated_result = monte_carlo_from_cdf_lookup(x_table, y_table, mc_input=noisy_x_percentile) .. image:: /_static/monte_carlo_example2.png """ if mc_input is 'random': if num_draws is None: msg = ("If input ``mc_input`` is set to ``random``, \n" "``num_draws`` must be specified") raise ValueError(msg) with NumpyRNGContext(seed): mc_input = np.random.rand(num_draws) else: if num_draws is not None: msg = ("If input ``mc_input`` is specified, \n" "the ``num_draws`` keyword should be left unspecified.") raise ValueError(msg) return np.interp(np.atleast_1d(mc_input), x_table, y_table)
[docs] def build_cdf_lookup(y, npts_lookup_table=1000): r""" Compute a lookup table for the cumulative distribution function specified by the input set of ``y`` values. The input data ``y`` will be used to define the CDF P(< y) in the usual way: the array ``y`` will be sorted, and the largest value corresponds to CDF value 1/npts_data, the second largest value 2/npts_data, etc. For performance reasons, this correspondence will be used to build a sparse lookup table of length ``npts_lookup_table``. The accuracy of the returned CDF is fundamentally limited by npts_data, and optionally limited by npts_lookup_table. Parameters ---------- y : ndarray Numpy array of shape (npts_data, ) defining the distribution function of the returned lookup table. npts_lookup_table : int, optional Number of control points in the returned lookup table. Cannot exceed npts_data. Returns ------- x_table : ndarray Numpy array of shape (npts_lookup_table, ) storing the control points at which the CDF has been evaluated. y_table : ndarray Numpy array of shape (npts_lookup_table, ) storing the values of the random variable associated with each control point in the ``x_table``. Examples -------- >>> y = np.random.normal(size=int(1e5)) >>> x_table, y_table = build_cdf_lookup(y, npts_lookup_table=100) """ y = np.atleast_1d(y) assert len(y) > 1, "Input ``y`` has only one element" npts_y = len(y) if npts_y < npts_lookup_table: warning_msg = ("The build_cdf_lookup function was called with the (optional) ``npts_lookup_table`` " "argument set to {0}.\n" "However, the number of data points in your data table npts_y = {1}.\n" "The default behavior in this situation is to overwrite ``npts_lookup_table`` = ``npts_y``,\n" "so that every point in the data set is used to build the lookup table.") warn(warning_msg.format(npts_lookup_table, npts_y)) npts_lookup_table = max(npts_lookup_table, npts_y) sorted_y = np.sort(y) normed_rank_order = _sorted_rank_order_array(npts_y) x_table = _sorted_rank_order_array(npts_lookup_table) y_table = np.interp(x_table, normed_rank_order, sorted_y) return x_table, y_table
[docs] def rank_order_percentile(y): """ Return the rank-order percentile of the values of an input distribution ``y``. Parameters ---------- y : ndarray Numpy array of shape (npts, ) storing an input distribution of values Returns ------- ranks : ndarray Numpy array of shape (npts, ) storing the rank-order percentile of every value of the input ``y``. Examples -------- >>> percentile = rank_order_percentile(np.random.normal(size=int(1e2))) """ idx_sorted = np.argsort(y) return _sorted_rank_order_array(len(y))[unsorting_indices(idx_sorted)]
def _sorted_rank_order_array(npts): """ Return the length-npts array [1/npts+1, 2/npts+1, ... npts/npts+1]. Values are linearly spaced in ascending order respecting the strict inequalities 0 < x < 1. Notes ----- This function is typically used to calculate the control points of a CDF. """ return np.arange(1, npts+1)/float(npts+1)