Source code for sum_stat.io.writer

"""HDF5 writer for summary statistics with units and provenance metadata."""

from __future__ import annotations

import datetime
import subprocess
from pathlib import Path
from typing import Any

import h5py
import numpy as np
from astropy.cosmology import FlatLambdaCDM

import sum_stat


[docs] class SummaryStatWriter: """Write summary statistics to an HDF5 file with full units and provenance. Each statistic occupies a named group (e.g. ``"angular_2pcf/DESI_BGS"``). Every dataset carries ``unit`` and ``description`` attributes. Every group carries an ``estimator`` and survey provenance attributes. The cosmology is serialised inside each group so the file is self-contained. Parameters ---------- path : str or Path Output HDF5 file path. mode : str ``"w"`` to overwrite, ``"a"`` to append to an existing file. Examples -------- >>> writer = SummaryStatWriter("results.h5") >>> writer.write_twopcf("angular_2pcf/BGS", theta, w, cov, edges, "landy-szalay", cosmo, {}) >>> writer.close() """ def __init__(self, path: str | Path, mode: str = "w") -> None: self._path = Path(path) self._file = h5py.File(self._path, mode) self._file.attrs["created_by"] = f"sum_stat {sum_stat.__version__}" self._file.attrs["date"] = datetime.datetime.utcnow().isoformat() try: self._git_hash = ( subprocess.check_output( ["git", "rev-parse", "--short", "HEAD"], cwd=Path(__file__).parents[2], stderr=subprocess.DEVNULL, ) .decode() .strip() ) except Exception: self._git_hash = "unknown" self._file.attrs["git_hash"] = self._git_hash # ------------------------------------------------------------------ # Public write methods # ------------------------------------------------------------------
[docs] def write_lf( self, group: str, mag_centres: np.ndarray, phi: np.ndarray, phi_err: np.ndarray, cov: np.ndarray, bin_edges: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], ) -> None: """Write a luminosity function. Parameters ---------- group : str HDF5 group path, e.g. ``"lf/DESI_BGS_r"``. mag_centres : ndarray, shape (n_bins,) Magnitude bin centres [mag]. phi : ndarray, shape (n_bins,) Luminosity function [Mpc^-3 mag^-1]. phi_err : ndarray, shape (n_bins,) Uncertainty on phi [Mpc^-3 mag^-1]. cov : ndarray, shape (n_bins, n_bins) Covariance matrix [(Mpc^-3 mag^-1)^2]. bin_edges : ndarray, shape (n_bins+1,) Magnitude bin edges [mag]. cosmo : FlatLambdaCDM Cosmology used for the V_max calculation. meta : dict Extra attributes stored on the group (e.g. estimator, band, z_min, z_max). """ g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "M_centres", mag_centres, "mag", "Absolute magnitude bin centres") self._write_dataset(g, "bin_edges", bin_edges, "mag", "Absolute magnitude bin edges") self._write_dataset(g, "phi", phi, "Mpc^-3 mag^-1", "Luminosity function phi(M)") self._write_dataset(g, "phi_err", phi_err, "Mpc^-3 mag^-1", "Uncertainty on phi(M)") self._write_dataset(g, "cov", cov, "(Mpc^-3 mag^-1)^2", "Covariance matrix of phi(M)")
[docs] def write_smf( self, group: str, mstar_centres: np.ndarray, phi: np.ndarray, phi_err: np.ndarray, cov: np.ndarray, bin_edges: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], ) -> None: """Write a stellar mass function. Parameters ---------- group : str HDF5 group path, e.g. ``"smf/DESI_BGS"``. mstar_centres : ndarray, shape (n_bins,) log10(M_*/M_sun) bin centres [dex]. phi : ndarray, shape (n_bins,) Stellar mass function [Mpc^-3 dex^-1]. phi_err : ndarray, shape (n_bins,) Uncertainty [Mpc^-3 dex^-1]. cov : ndarray, shape (n_bins, n_bins) Covariance matrix [(Mpc^-3 dex^-1)^2]. bin_edges : ndarray, shape (n_bins+1,) log10(M_*/M_sun) bin edges [dex]. cosmo : FlatLambdaCDM meta : dict """ g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset( g, "log10mstar_centres", mstar_centres, "dex", "log10(M_*/M_sun) bin centres" ) self._write_dataset(g, "bin_edges", bin_edges, "dex", "log10(M_*/M_sun) bin edges") self._write_dataset(g, "phi", phi, "Mpc^-3 dex^-1", "Stellar mass function phi(M_*)") self._write_dataset(g, "phi_err", phi_err, "Mpc^-3 dex^-1", "Uncertainty on phi(M_*)") self._write_dataset(g, "cov", cov, "(Mpc^-3 dex^-1)^2", "Covariance matrix of phi(M_*)")
[docs] def write_twopcf( self, group: str, sep_centres: np.ndarray, xi: np.ndarray, cov: np.ndarray, bin_edges: np.ndarray, estimator: str, cosmo: FlatLambdaCDM, meta: dict[str, Any], sep_unit: str = "Mpc", xi_unit: str = "dimensionless", ) -> None: """Write a two-point correlation function. Handles w(θ), ξ(r), w_p(r_p), and individual multipoles ξ_ℓ(s). Parameters ---------- group : str HDF5 group path, e.g. ``"angular_2pcf/DESI_BGS"``. sep_centres : ndarray, shape (n_bins,) Separation bin centres. xi : ndarray, shape (n_bins,) Correlation function values. cov : ndarray, shape (n_bins, n_bins) Covariance matrix. bin_edges : ndarray, shape (n_bins+1,) Separation bin edges. estimator : str Name of the estimator (e.g. ``"landy-szalay"``). cosmo : FlatLambdaCDM meta : dict Extra attributes (e.g. pi_max_Mpc, ell, survey area). sep_unit : str Physical unit string for separation axis (e.g. ``"arcmin"``, ``"Mpc"``). xi_unit : str Physical unit string for correlation values. """ meta = dict(meta) meta.setdefault("estimator", estimator) g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "sep_centres", sep_centres, sep_unit, "Separation bin centres") self._write_dataset(g, "bin_edges", bin_edges, sep_unit, "Separation bin edges") self._write_dataset(g, "xi", xi, xi_unit, "Correlation function") cov_unit = f"({xi_unit})^2" if xi_unit != "dimensionless" else "dimensionless" self._write_dataset(g, "cov", cov, cov_unit, "Covariance matrix")
[docs] def write_multipoles( self, group: str, s_centres: np.ndarray, xi_dict: dict[int, np.ndarray], cov: np.ndarray, bin_edges: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], sep_unit: str = "Mpc/h", ) -> None: """Write redshift-space multipoles ξ_ℓ(s). Parameters ---------- group : str HDF5 group path. s_centres : ndarray, shape (n_bins,) Separation bin centres. xi_dict : dict Keys are multipole orders (0, 2, 4); values are shape (n_bins,). cov : ndarray, shape (n_ell*n_bins, n_ell*n_bins) Stacked covariance matrix. bin_edges : ndarray, shape (n_bins+1,) Separation bin edges. cosmo : FlatLambdaCDM meta : dict sep_unit : str """ meta = dict(meta) meta["ells"] = list(xi_dict.keys()) g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "s_centres", s_centres, sep_unit, "Separation bin centres") self._write_dataset(g, "bin_edges", bin_edges, sep_unit, "Separation bin edges") for ell, xi_l in xi_dict.items(): self._write_dataset( g, f"xi{ell}", xi_l, "dimensionless", f"Multipole ell={ell} of xi(s)" ) self._write_dataset( g, "cov", cov, "dimensionless", "Stacked covariance matrix (ells ordered as in attrs.ells)", )
[docs] def write_pk( self, group: str, k_centres: np.ndarray, pk_dict: dict[int, np.ndarray], cov: np.ndarray, bin_edges: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], ) -> None: """Write power spectrum P(k) or multipoles P_ℓ(k). Parameters ---------- group : str HDF5 group path, e.g. ``"pk3d/DESI_BGS"``. k_centres : ndarray, shape (n_k,) Wavenumber bin centres [h/Mpc]. pk_dict : dict Keys are multipole orders (0, 2, 4); values are shape (n_k,) [Mpc/h]^3. cov : ndarray, shape (n_ell*n_k, n_ell*n_k) Stacked covariance matrix [(Mpc/h)^6]. bin_edges : ndarray, shape (n_k+1,) Wavenumber bin edges [h/Mpc]. cosmo : FlatLambdaCDM meta : dict """ meta = dict(meta) meta["ells"] = list(pk_dict.keys()) g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "k_centres", k_centres, "h/Mpc", "Wavenumber bin centres") self._write_dataset(g, "bin_edges", bin_edges, "h/Mpc", "Wavenumber bin edges") for ell, pk_l in pk_dict.items(): self._write_dataset( g, f"pk{ell}", pk_l, "(Mpc/h)^3", f"Power spectrum multipole ell={ell}" ) self._write_dataset(g, "cov", cov, "(Mpc/h)^6", "Stacked covariance matrix")
[docs] def write_cl( self, group: str, ell_eff: np.ndarray, cl: np.ndarray, nl: np.ndarray, cov: np.ndarray, bin_edges: np.ndarray, meta: dict[str, Any], ) -> None: """Write angular power spectrum C_ℓ. Parameters ---------- group : str HDF5 group path, e.g. ``"cl/DESI_BGS"``. ell_eff : ndarray, shape (n_ell,) Effective multipole per bin. cl : ndarray, shape (n_ell,) Angular power spectrum [sr]. nl : ndarray, shape (n_ell,) Shot noise power spectrum [sr]. cov : ndarray, shape (n_ell, n_ell) Covariance matrix [sr^2]. bin_edges : ndarray, shape (n_ell+1,) Multipole bin edges. meta : dict """ g = self._make_group(group, meta) self._write_dataset(g, "ell_eff", ell_eff, "dimensionless", "Effective multipole") self._write_dataset(g, "bin_edges", bin_edges, "dimensionless", "Multipole bin edges") self._write_dataset(g, "cl", cl, "sr", "Angular power spectrum C_ell") self._write_dataset(g, "nl", nl, "sr", "Shot noise power spectrum N_ell") self._write_dataset(g, "cov", cov, "sr^2", "Covariance matrix of C_ell")
[docs] def write_esd( self, group: str, rp_centres: np.ndarray, delta_sigma: np.ndarray, cov: np.ndarray, bin_edges: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], ) -> None: """Write excess surface mass density ΔΣ(r_p). Parameters ---------- group : str HDF5 group path, e.g. ``"esd/DESI_BGS_KiDS"``. rp_centres : ndarray, shape (n_bins,) Projected separation bin centres [Mpc]. delta_sigma : ndarray, shape (n_bins,) ΔΣ(r_p) [M_sun/pc^2]. cov : ndarray, shape (n_bins, n_bins) Jackknife covariance matrix [(M_sun/pc^2)^2]. bin_edges : ndarray, shape (n_bins+1,) Projected separation bin edges [Mpc]. cosmo : FlatLambdaCDM meta : dict Should include lens_survey, source_survey, m_bias, R_mean, photo_z_correction, boost_correction flags. """ g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "rp_centres", rp_centres, "Mpc", "Projected separation bin centres") self._write_dataset(g, "bin_edges", bin_edges, "Mpc", "Projected separation bin edges") self._write_dataset( g, "delta_sigma", delta_sigma, "M_sun/pc^2", "Excess surface mass density Delta_Sigma(r_p)", ) self._write_dataset( g, "cov", cov, "(M_sun/pc^2)^2", "Jackknife covariance matrix of Delta_Sigma" )
[docs] def write_knn( self, group: str, r_centres: np.ndarray, F_k: np.ndarray, F_k_poisson: np.ndarray, bin_edges: np.ndarray, k_values: np.ndarray, cosmo: FlatLambdaCDM, meta: dict[str, Any], ) -> None: """Write kNN-CDF summary statistics. Parameters ---------- group : str HDF5 group path, e.g. ``"knn/DESI_BGS"``. r_centres : ndarray, shape (n_r,) Separation bin centres [Mpc]. F_k : ndarray, shape (n_k, n_r) Empirical kNN-CDF for each k in k_values. F_k_poisson : ndarray, shape (n_k, n_r) Erlang (Poisson) reference CDF for each k. bin_edges : ndarray, shape (n_r+1,) Separation bin edges [Mpc]. k_values : ndarray, shape (n_k,) Neighbor orders used (e.g. [1, 2, 3, 4, 5]). cosmo : FlatLambdaCDM meta : dict Extra attributes (estimator, n_query, n_gal, survey, etc.). """ meta = dict(meta) meta.setdefault("estimator", "knn-cdf") g = self._make_group(group, meta) self._write_cosmology(g, cosmo) self._write_dataset(g, "r_centres", r_centres, "Mpc", "Separation bin centres") self._write_dataset(g, "bin_edges", bin_edges, "Mpc", "Separation bin edges") self._write_dataset(g, "k_values", k_values, "dimensionless", "Neighbor orders") self._write_dataset( g, "F_k", F_k, "dimensionless", "Empirical kNN-CDF; shape (n_k, n_r)" ) self._write_dataset( g, "F_k_poisson", F_k_poisson, "dimensionless", "Poisson (Erlang) reference kNN-CDF; shape (n_k, n_r)" )
[docs] def close(self) -> None: """Close the HDF5 file.""" self._file.close()
def __enter__(self) -> SummaryStatWriter: return self def __exit__(self, *args) -> None: self.close() # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ def _make_group(self, path: str, meta: dict[str, Any]) -> h5py.Group: g = self._file.require_group(path) for key, val in meta.items(): if val is not None: try: g.attrs[key] = val except TypeError: g.attrs[key] = str(val) return g def _write_dataset( self, node: h5py.Group, name: str, data: np.ndarray, unit: str, description: str, ) -> None: if name in node: del node[name] ds = node.create_dataset(name, data=np.asarray(data, dtype=np.float64)) ds.attrs["unit"] = unit ds.attrs["description"] = description def _write_cosmology(self, group: h5py.Group, cosmo: FlatLambdaCDM) -> None: cg = group.require_group("cosmology") cg.attrs["name"] = getattr(cosmo, "name", "") or "" for name, val, unit in [ ("H0", float(cosmo.H0.value), "km/s/Mpc"), ("h", float(cosmo.H0.value / 100.0), "dimensionless"), ("Om0", float(cosmo.Om0), "dimensionless"), ("Ob0", float(cosmo.Ob0) if cosmo.Ob0 is not None else float("nan"), "dimensionless"), ("Ok0", 0.0, "dimensionless"), ]: ds = cg.require_dataset(name, shape=(), dtype=np.float64) ds[()] = val ds.attrs["unit"] = unit