# Source code for halotools.sim_manager.cached_halo_catalog

""" Module storing the `~halotools.sim_manager.CachedHaloCatalog`,
the class responsible for retrieving halo catalogs from shorthand
keyword inputs such as ``simname`` and ``redshift``.
"""
import os
from warnings import warn
from copy import deepcopy
import numpy as np

from astropy.table import Table
from ..utils.python_string_comparisons import _passively_decode_string, compare_strings_py23_safe

# h5py is treated as optional at import time: a missing install only emits a
# warning here, and the outcome is recorded in ``_HAS_H5PY`` so that
# CachedHaloCatalog can refuse to be constructed later on.
try:
    import h5py
    _HAS_H5PY = True
except ImportError:
    _HAS_H5PY = False
    warn("Most of the functionality of the "
        "sim_manager sub-package requires h5py to be installed,\n"
        "which can be accomplished either with pip or conda. ")

from ..sim_manager import sim_defaults, supported_sims

from ..utils import broadcast_host_halo_property, add_halo_hostid

from .halo_table_cache import HaloTableCache
from .ptcl_table_cache import PtclTableCache
from .halo_table_cache_log_entry import get_redshift_string

from ..custom_exceptions import HalotoolsError, InvalidCacheLogEntry


__all__ = ('CachedHaloCatalog', )


class CachedHaloCatalog(object):
    """
    Container class for the halo catalogs and particle data
    that are stored in the Halotools cache log.

    `CachedHaloCatalog` is used to retrieve halo catalogs
    from shorthand keyword inputs such as
    ``simname``, ``halo_finder`` and ``redshift``.

    The halos are stored in the ``halo_table`` attribute
    in the form of an Astropy `~astropy.table.Table`.
    If available, another `~astropy.table.Table` storing
    a random downsampling of dark matter particles
    is stored in the ``ptcl_table`` attribute.
    See the Examples section below for details on how to
    access and manipulate this data.

    For a list of available snapshots provided by Halotools,
    see :ref:`supported_sim_list`.
    For information about the subhalo vs. host halo nomenclature
    conventions used throughout Halotools, see :ref:`rockstar_subhalo_nomenclature`.
    For a thorough discussion of the meaning of each column in the Rockstar halo catalogs,
    see the appendix of
    `Rodriguez Puebla et al 2016 <http://arxiv.org/abs/1602.04813>`_.
    """

    acceptable_kwargs = ('ptcl_version_name', 'fname', 'simname', 'halo_finder',
        'redshift', 'version_name', 'dz_tol', 'update_cached_fname',
        'preload_halo_table')

    # Metadata keywords that identify a catalog by themselves and therefore
    # may not be combined with ``fname``. The order fixes which conflict is
    # reported first.
    _fname_exclusive_kwargs = ('simname', 'halo_finder', 'redshift', 'version_name')

    def __init__(self, *args, **kwargs):
        """
        Parameters
        ------------
        simname : string, optional
            Nickname of the simulation used as a shorthand way to keep track
            of the halo catalogs in your cache.
            The simnames of the Halotools-provided catalogs are
            'bolshoi', 'bolplanck', 'consuelo' and 'multidark'.

            Default is set by the ``default_simname`` variable in the
            `~halotools.sim_manager.sim_defaults` module.

        halo_finder : string, optional
            Nickname of the halo-finder used to generate the hlist file from particle data.

            Default is set by the ``default_halo_finder`` variable in the
            `~halotools.sim_manager.sim_defaults` module.

        redshift : float, optional
            Redshift of the halo catalog.

            Default is set by the ``default_redshift`` variable in the
            `~halotools.sim_manager.sim_defaults` module.

        version_name : string, optional
            Nickname of the version of the halo catalog.

            Default is set by the ``default_version_name`` variable in the
            `~halotools.sim_manager.sim_defaults` module.

        ptcl_version_name : string, optional
            Nickname of the version of the particle catalog associated with
            the halos.

            This argument is typically only used if you have cached your own
            particles via the `~halotools.sim_manager.UserSuppliedPtclCatalog` class.
            Default is set by the ``default_ptcl_version_name`` variable in the
            `~halotools.sim_manager.sim_defaults` module.

        fname : string, optional
            Absolute path to the location on disk storing the hdf5 file
            of halo data. If passing ``fname``, do not pass the metadata keys
            ``simname``, ``halo_finder``, ``version_name`` or ``redshift``.

        update_cached_fname : bool, optional
            If the hdf5 file storing the halos has been relocated to a new
            disk location after storing the data in cache,
            the ``update_cached_fname`` input can be used together with the
            ``fname`` input to update the cache log with the new disk location.

            See :ref:`relocating_simulation_data_instructions` for
            further instructions.

        dz_tol : float, optional
            Tolerance within to search for a catalog with a matching redshift.
            Halo catalogs in cache with a redshift that differs by greater
            than ``dz_tol`` will be ignored. Default is 0.05.

        preload_halo_table : bool, optional
            If True, the ``halo_table`` attribute is accessed (and therefore
            read from disk) while the instance is being constructed.
            Default is False.

        Raises
        ------
        HalotoolsError
            If positional arguments or unrecognized keywords are passed,
            or if the requested catalog is known to be buggy.
        AssertionError
            If h5py is not installed.

        Examples
        ---------
        If you followed the instructions in the
        :ref:`download_default_halos` section of the :ref:`getting_started` guide,
        then you can load the default halo catalog into memory by calling the
        `~halotools.sim_manager.CachedHaloCatalog` with no arguments:

        >>> halocat = CachedHaloCatalog() # doctest: +SKIP

        The halos are stored in the ``halo_table`` attribute
        in the form of an Astropy `~astropy.table.Table`.

        >>> halos = halocat.halo_table # doctest: +SKIP

        As with any Astropy `~astropy.table.Table`, the properties of the
        halos can be accessed in the same manner as a Numpy structured array
        or python dictionary:

        >>> array_of_masses = halocat.halo_table['halo_mvir'] # doctest: +SKIP
        >>> x_positions = halocat.halo_table['halo_x'] # doctest: +SKIP

        Note that all keys of a cached halo catalog begin with the substring
        ``halo_``. This is a bookkeeping device used to help
        the internals of Halotools differentiate
        between halo properties and the properties of mock galaxies
        populated into the halos with ambiguously similar names.

        The ``simname``, ``halo_finder``, ``version_name`` and ``redshift``
        keyword arguments fully specify the halo catalog that will be loaded.
        Omitting any of them will select the corresponding default value
        set in the `~halotools.sim_manager.sim_defaults` module.

        >>> halocat = CachedHaloCatalog(redshift = 1, simname = 'multidark') # doctest: +SKIP

        If you forget which catalogs you have stored in cache,
        you have two options for how to remind yourself.
        First, you can use the `~halotools.sim_manager.HaloTableCache` class:

        >>> from halotools.sim_manager import HaloTableCache
        >>> cache = HaloTableCache()
        >>> for entry in cache.log: print(entry) # doctest: +SKIP

        Alternatively, you can simply use a text editor to open the cache log,
        which is stored as ASCII data in the following location on your machine:

        $HOME/.astropy/cache/halotools/halo_table_cache_log.txt

        See also
        ----------
        :ref:`halo_catalog_analysis_quickstart`

        :ref:`halo_catalog_analysis_tutorial`

        """
        self._verify_acceptable_constructor_call(*args, **kwargs)

        # Raised explicitly (rather than with an ``assert`` statement) so the
        # check also runs under ``python -O``; the exception type is kept
        # as AssertionError for backward compatibility.
        if not _HAS_H5PY:
            raise AssertionError(
                "Must have h5py package installed to use CachedHaloCatalog objects")

        self._dz_tol = kwargs.get('dz_tol', 0.05)
        self._update_cached_fname = kwargs.get('update_cached_fname', False)

        self.halo_table_cache = HaloTableCache()

        self._disallow_catalogs_with_known_bugs(**kwargs)

        self.log_entry = self._determine_cache_log_entry(**kwargs)
        self.simname = self.log_entry.simname
        self.halo_finder = self.log_entry.halo_finder
        self.version_name = self.log_entry.version_name
        self.redshift = self.log_entry.redshift
        self.fname = self.log_entry.fname

        self._bind_additional_metadata()

        if kwargs.get('preload_halo_table', False) is True:
            # Accessing the property triggers the read from disk.
            _ = self.halo_table
            del _

        self._set_publication_list(self.simname)

    def _set_publication_list(self, simname):
        """ Bind ``self.publications`` to the publication list of the
        supported simulation registered under ``simname``, or to an empty list
        when no such simulation (or no such attribute) exists.
        """
        try:
            simclass = supported_sims.supported_sim_dict[simname]
            simobj = simclass()
            self.publications = simobj.publications
        except (KeyError, AttributeError):
            self.publications = []

    def _verify_acceptable_constructor_call(self, *args, **kwargs):
        """ Raise `HalotoolsError` if any positional argument was passed,
        or if any keyword is not listed in ``acceptable_kwargs``.
        """
        if len(args) != 0:
            msg = ("\nCachedHaloCatalog only accepts keyword arguments, not position arguments. \n")
            raise HalotoolsError(msg)

        for key in kwargs:
            if key not in self.acceptable_kwargs:
                msg = ("\nCachedHaloCatalog got an unexpected keyword ``" + key + "``\n"
                    "The only acceptable keywords are listed below:\n\n")
                for acceptable_key in self.acceptable_kwargs:
                    msg += "``" + acceptable_key + "``\n"
                raise HalotoolsError(msg)

    def _determine_cache_log_entry(self, **kwargs):
        """ Return the cache log entry selected by the constructor keywords.

        The entry is looked up either from ``fname`` alone, or from the
        combination of ``simname``, ``halo_finder``, ``version_name`` and
        ``redshift``, each falling back to its ``sim_defaults`` value.
        As a side effect, ``self.ptcl_version_name`` and the
        ``self._default_*_choice`` flags (True when the value came from
        ``sim_defaults``) are bound to the instance.

        Raises
        ------
        HalotoolsError
            If ``fname`` does not exist on disk, or if ``fname`` is combined
            with any of the metadata keywords.
        """
        if 'ptcl_version_name' in kwargs:
            self.ptcl_version_name = kwargs['ptcl_version_name']
            self._default_ptcl_version_name_choice = False
        else:
            self.ptcl_version_name = sim_defaults.default_ptcl_version_name
            self._default_ptcl_version_name_choice = True

        if 'fname' in kwargs:
            fname = kwargs['fname']

            if not os.path.isfile(fname):
                msg = ("\nThe ``fname`` you passed to the CachedHaloCatalog "
                    "constructor is a non-existent path.\n")
                raise HalotoolsError(msg)

            for key in self._fname_exclusive_kwargs:
                if key in kwargs:
                    msg = ("\nIf you specify an input ``fname``, "
                        "do not also specify ``" + key + "``.\n")
                    raise HalotoolsError(msg)

            # None of the metadata is taken from sim_defaults in this branch.
            # The flags must still exist because the particle-catalog lookup
            # reads them when it assembles its error message.
            self._default_simname_choice = False
            self._default_halo_finder_choice = False
            self._default_version_name_choice = False
            self._default_redshift_choice = False

            return self._retrieve_matching_log_entry_from_fname(fname)

        def choose(key, default, cast):
            # Return (value, value_came_from_sim_defaults)
            if key in kwargs:
                return cast(kwargs[key]), False
            return default, True

        simname, self._default_simname_choice = choose(
            'simname', sim_defaults.default_simname, str)
        halo_finder, self._default_halo_finder_choice = choose(
            'halo_finder', sim_defaults.default_halo_finder, str)
        version_name, self._default_version_name_choice = choose(
            'version_name', sim_defaults.default_version_name, str)
        redshift, self._default_redshift_choice = choose(
            'redshift', sim_defaults.default_redshift, float)

        return self._retrieve_matching_log_entry_from_metadata(
            simname, halo_finder, version_name, redshift)

    def _retrieve_matching_log_entry_from_fname(self, fname):
        """ Return the cache log entry built from the hdf5 file at ``fname``.

        If the ``fname`` stored as metadata in the file differs from the input,
        the cache is updated when ``update_cached_fname`` was set to True,
        and a `HalotoolsError` is raised otherwise.
        """
        log_entry = self.halo_table_cache.determine_log_entry_from_fname(fname,
            overwrite_fname_metadata=False)

        if compare_strings_py23_safe(log_entry.fname, fname):
            return log_entry

        if self._update_cached_fname is True:
            old_fname = deepcopy(log_entry.fname)
            log_entry = (
                self.halo_table_cache.determine_log_entry_from_fname(fname,
                    overwrite_fname_metadata=self._update_cached_fname)
                )
            self.halo_table_cache.update_cached_file_location(
                fname, old_fname)
            return log_entry

        msg = ("\nThe ``fname`` you passed as an input to the "
            "CachedHaloCatalog class \ndoes not match the ``fname`` "
            "stored as metadata in the hdf5 file.\n"
            "This means that at some point you manually relocated the catalog on disk \n"
            "after storing its location in cache, "
            "but you did not yet update the Halotools cache log. \n"
            "When possible, try to keep your halo catalogs "
            "at a fixed disk location \n"
            "as this helps ensure reproducibility. \n"
            "If the ``fname`` you passed to CachedHaloCatalog is the "
            "new location you want to store the catalog, \n"
            "then you can update the cache by calling the CachedHaloCatalog \n"
            "constructor again and setting the ``update_cached_fname`` variable to True.\n")
        raise HalotoolsError(msg)

    @staticmethod
    def _format_request_line(key, value, is_default, default_name):
        """ Return one ``key = value`` line of an error message, annotated with
        the ``sim_defaults`` variable it came from when ``is_default`` is True.
        """
        line = key + " = ``" + str(value) + "``"
        if is_default is True:
            line += " (set by sim_defaults." + default_name + ")"
        return line + "\n"

    def _build_match_failure_message(self, msg, matching_entries, fallback_generators):
        """ Complete the error message for a lookup that did not yield exactly one entry.

        Parameters
        ----------
        msg : string
            Description of the requested catalog, to which the diagnosis is appended.

        matching_entries : list
            Log entries that matched the full request; either empty or of length > 1.

        fallback_generators : sequence
            Generators of log entries for progressively looser requests.
            They are consumed in order, and only until one yields something.

        Returns
        -------
        msg : string
        """
        dz_tol = str(self._dz_tol)

        if len(matching_entries) > 1:
            msg += ("\nThere are multiple entries in the cache log \n"
                "within dz_tol = " + dz_tol + " of your inputs. \n"
                "Try using the exact redshift and/or decreasing dz_tol.\n"
                "Now printing the matching entries:\n\n")
            for entry in matching_entries:
                msg += str(entry) + "\n"
            return msg

        msg += ("\nThere is no matching catalog in cache "
            "within dz_tol = " + dz_tol + " of these inputs.\n")

        for generator in fallback_generators:
            closest_entries = list(generator)
            if closest_entries:
                msg += ("\nThe following entries in the cache log "
                    "most closely match your inputs:\n\n")
                for entry in closest_entries:
                    msg += str(entry) + "\n\n"
                return msg

        msg += "There are no simulations matching your input simname.\n"
        return msg

    def _retrieve_matching_ptcl_cache_log_entry(self):
        """ Return the unique particle-catalog log entry matching
        ``simname``, ``ptcl_version_name`` and ``redshift`` (within ``dz_tol``).

        Raises
        ------
        HalotoolsError
            If the particle cache log is empty.
        InvalidCacheLogEntry
            If zero or several entries match.
        """
        ptcl_table_cache = PtclTableCache()
        if len(ptcl_table_cache.log) == 0:
            msg = ("\nThe Halotools cache log has no record of any particle catalogs.\n"
                "If you have never used Halotools before, "
                "you should read the Getting Started guide on halotools.readthedocs.io.\n"
                "If you have previously used the package before, \n"
                "try running the halotools/scripts/rebuild_ptcl_table_cache_log.py script.\n")
            raise HalotoolsError(msg)

        matching_entries = list(ptcl_table_cache.matching_log_entry_generator(
            simname=self.simname, version_name=self.ptcl_version_name,
            redshift=self.redshift, dz_tol=self._dz_tol))
        if len(matching_entries) == 1:
            return matching_entries[0]

        msg = ("\nYou tried to load a cached particle catalog "
            "with the following characteristics:\n\n")
        msg += self._format_request_line('simname', self.simname,
            self._default_simname_choice, 'default_simname')
        msg += self._format_request_line('ptcl_version_name', self.ptcl_version_name,
            self._default_ptcl_version_name_choice, 'default_ptcl_version_name')
        msg += self._format_request_line('redshift', self.redshift,
            self._default_redshift_choice, 'default_redshift')

        fallback_generators = (
            # discard the redshift requirement
            ptcl_table_cache.matching_log_entry_generator(
                simname=self.simname, version_name=self.ptcl_version_name),
            # discard the version_name requirement
            ptcl_table_cache.matching_log_entry_generator(simname=self.simname),
            )
        raise InvalidCacheLogEntry(self._build_match_failure_message(
            msg, matching_entries, fallback_generators))

    def _retrieve_matching_log_entry_from_metadata(self,
            simname, halo_finder, version_name, redshift):
        """ Return the unique halo-catalog log entry matching the input
        metadata, with ``redshift`` matched within ``dz_tol``.

        Raises
        ------
        HalotoolsError
            If the halo table cache log is empty.
        InvalidCacheLogEntry
            If zero or several entries match.
        """
        cache = self.halo_table_cache
        if len(cache.log) == 0:
            msg = ("\nThe Halotools cache log is empty.\n"
                "If you have never used Halotools before, "
                "you should read the Getting Started guide on halotools.readthedocs.io.\n"
                "If you have previously used the package before, \n"
                "try running the halotools/scripts/rebuild_halo_table_cache_log.py script.\n")
            raise HalotoolsError(msg)

        matching_entries = list(cache.matching_log_entry_generator(
            simname=simname, halo_finder=halo_finder,
            version_name=version_name, redshift=redshift,
            dz_tol=self._dz_tol))
        if len(matching_entries) == 1:
            return matching_entries[0]

        msg = ("\nYou tried to load a cached halo catalog "
            "with the following characteristics:\n\n")
        msg += self._format_request_line('simname', simname,
            self._default_simname_choice, 'default_simname')
        msg += self._format_request_line('halo_finder', halo_finder,
            self._default_halo_finder_choice, 'default_halo_finder')
        msg += self._format_request_line('version_name', version_name,
            self._default_version_name_choice, 'default_version_name')
        msg += self._format_request_line('redshift', redshift,
            self._default_redshift_choice, 'default_redshift')

        fallback_generators = (
            # discard the redshift requirement
            cache.matching_log_entry_generator(
                simname=simname, halo_finder=halo_finder, version_name=version_name),
            # discard the version_name requirement
            cache.matching_log_entry_generator(
                simname=simname, halo_finder=halo_finder),
            # discard the halo_finder requirement
            cache.matching_log_entry_generator(simname=simname),
            )
        raise InvalidCacheLogEntry(self._build_match_failure_message(
            msg, matching_entries, fallback_generators))

    @property
    def halo_table(self):
        """
        Astropy `~astropy.table.Table` object storing a catalog of dark matter halos.

        You can access the array storing, say, halo virial mass using the following syntax:

        >>> halocat = CachedHaloCatalog() # doctest: +SKIP
        >>> mass_array = halocat.halo_table['halo_mvir'] # doctest: +SKIP

        To see what halo properties are available in the catalog:

        >>> print(halocat.halo_table.keys()) # doctest: +SKIP
        """
        try:
            return self._halo_table
        except AttributeError:
            if self.log_entry.safe_for_cache is not True:
                raise InvalidCacheLogEntry(self.log_entry._cache_safety_message)

            table = Table.read(_passively_decode_string(self.fname), path='data')
            self._add_new_derived_columns(table)
            # Bind only once the table is complete, so that a failure above
            # cannot leave a table without its derived columns in the cache.
            self._halo_table = table
            return self._halo_table

    def _add_new_derived_columns(self, t):
        """ Add the ``halo_hostid`` and ``halo_mvir_host_halo`` columns
        to the table ``t`` when they are not already present.
        """
        if 'halo_hostid' not in t.keys():
            add_halo_hostid(t)

        if 'halo_mvir_host_halo' not in t.keys():
            broadcast_host_halo_property(t, 'halo_mvir')

    def _bind_additional_metadata(self):
        """ Create convenience bindings of all metadata to the `CachedHaloCatalog` instance.

        Every attribute of the hdf5 file is bound to the instance, after which
        the attributes listed by the matching supported simulation (if any)
        are either cross-checked against the file or copied over when the
        file did not provide them.

        Raises
        ------
        InvalidCacheLogEntry
            If the file named by the log entry does not exist.
        HalotoolsError
            If an attribute of the file disagrees with the supported simulation.
        """
        fname = self.log_entry.fname
        if not os.path.isfile(fname):
            msg = ("The following input fname does not exist: \n\n" +
                fname + "\n\n")
            raise InvalidCacheLogEntry(msg)

        # The context manager closes the file even if reading an attribute raises.
        with h5py.File(fname, 'r') as f:
            for attr_key in list(f.attrs.keys()):
                if attr_key == 'redshift':
                    setattr(self, attr_key, float(get_redshift_string(f.attrs[attr_key])))
                elif attr_key == 'Lbox':
                    # Stored as a length-3 array whether the file holds a scalar or a triplet
                    self.Lbox = np.empty(3)
                    self.Lbox[:] = f.attrs['Lbox']
                else:
                    setattr(self, attr_key, f.attrs[attr_key])

        matching_sim = self._retrieve_supported_sim()
        if matching_sim is None:
            return

        for attr in matching_sim._attrlist:
            if not hasattr(self, attr):
                setattr(self, attr, getattr(matching_sim, attr))
                continue

            a = _passively_decode_string(getattr(self, attr))
            b = _passively_decode_string(getattr(matching_sim, attr))
            if not np.all(a == b):
                msg = ("The ``" + attr + "`` metadata of the hdf5 file \n"
                    "is inconsistent with the corresponding attribute of the \n" +
                    matching_sim.__class__.__name__ + " class in the "
                    "sim_manager.supported_sims module.\n"
                    "Double-check the value of this attribute in the \n"
                    "NbodySimulation sub-class you added to the supported_sims module. \n"
                    )
                raise HalotoolsError(msg)

    def _retrieve_supported_sim(self):
        """ Return an instance of the class in ``supported_sims`` whose
        ``simname`` matches ``self.simname``, or None if there is no match.
        If several classes match, the last one listed in
        ``supported_sims.__all__`` is returned.
        """
        matching_sim = None
        for clname in supported_sims.__all__:
            try:
                cl = getattr(supported_sims, clname)
                obj = cl()
                if isinstance(obj, supported_sims.NbodySimulation):
                    if compare_strings_py23_safe(self.simname, obj.simname):
                        matching_sim = obj
            except TypeError:
                # Presumably names that cannot be called without arguments; skipped.
                pass
        return matching_sim

    @property
    def ptcl_table(self):
        """
        Astropy `~astropy.table.Table` object storing
        a collection of ~1e6 randomly selected dark matter particles.
        """
        try:
            return self._ptcl_table
        except AttributeError:
            try:
                ptcl_log_entry = self.ptcl_log_entry
            except AttributeError:
                self.ptcl_log_entry = (
                    self._retrieve_matching_ptcl_cache_log_entry()
                    )
                ptcl_log_entry = self.ptcl_log_entry

            if ptcl_log_entry.safe_for_cache is True:
                self._ptcl_table = Table.read(
                    _passively_decode_string(ptcl_log_entry.fname), path='data')
                return self._ptcl_table
            else:
                raise InvalidCacheLogEntry(ptcl_log_entry._cache_safety_message)

    def _disallow_catalogs_with_known_bugs(self, simname=sim_defaults.default_simname,
            version_name=sim_defaults.default_version_name, **kwargs):
        """ Raise `HalotoolsError` when the requested ``simname`` and
        ``version_name`` identify a catalog with a known bug.
        All other keywords are accepted and ignored.
        """
        if (simname == 'bolplanck') and ('halotools_alpha_version' in version_name):
            msg = ("The ``{0}`` version of the ``{1}`` simulation \n"
                "is known to be spatially incomplete and should not be used.\n"
                "See https://github.com/astropy/halotools/issues/598.\n"
                "You can either download the original ASCII data and process it yourself, \n"
                "or use version_name = ``halotools_v0p4`` instead.\n")
            raise HalotoolsError(msg.format(version_name, simname))