"""HDF5 writer for summary statistics with units and provenance metadata."""
from __future__ import annotations
import datetime
import subprocess
from pathlib import Path
from typing import Any
import h5py
import numpy as np
from astropy.cosmology import FlatLambdaCDM
import sum_stat
[docs]
class SummaryStatWriter:
"""Write summary statistics to an HDF5 file with full units and provenance.
Each statistic occupies a named group (e.g. ``"angular_2pcf/DESI_BGS"``).
Every dataset carries ``unit`` and ``description`` attributes.
Every group carries an ``estimator`` and survey provenance attributes.
The cosmology is serialised inside each group so the file is self-contained.
Parameters
----------
path : str or Path
Output HDF5 file path.
mode : str
``"w"`` to overwrite, ``"a"`` to append to an existing file.
Examples
--------
>>> writer = SummaryStatWriter("results.h5")
>>> writer.write_twopcf("angular_2pcf/BGS", theta, w, cov, edges, "landy-szalay", cosmo, {})
>>> writer.close()
"""
def __init__(self, path: str | Path, mode: str = "w") -> None:
self._path = Path(path)
self._file = h5py.File(self._path, mode)
self._file.attrs["created_by"] = f"sum_stat {sum_stat.__version__}"
self._file.attrs["date"] = datetime.datetime.utcnow().isoformat()
try:
self._git_hash = (
subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"],
cwd=Path(__file__).parents[2],
stderr=subprocess.DEVNULL,
)
.decode()
.strip()
)
except Exception:
self._git_hash = "unknown"
self._file.attrs["git_hash"] = self._git_hash
# ------------------------------------------------------------------
# Public write methods
# ------------------------------------------------------------------
[docs]
def write_lf(
self,
group: str,
mag_centres: np.ndarray,
phi: np.ndarray,
phi_err: np.ndarray,
cov: np.ndarray,
bin_edges: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
) -> None:
"""Write a luminosity function.
Parameters
----------
group : str
HDF5 group path, e.g. ``"lf/DESI_BGS_r"``.
mag_centres : ndarray, shape (n_bins,)
Magnitude bin centres [mag].
phi : ndarray, shape (n_bins,)
Luminosity function [Mpc^-3 mag^-1].
phi_err : ndarray, shape (n_bins,)
Uncertainty on phi [Mpc^-3 mag^-1].
cov : ndarray, shape (n_bins, n_bins)
Covariance matrix [(Mpc^-3 mag^-1)^2].
bin_edges : ndarray, shape (n_bins+1,)
Magnitude bin edges [mag].
cosmo : FlatLambdaCDM
Cosmology used for the V_max calculation.
meta : dict
Extra attributes stored on the group (e.g. estimator, band, z_min, z_max).
"""
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "M_centres", mag_centres, "mag", "Absolute magnitude bin centres")
self._write_dataset(g, "bin_edges", bin_edges, "mag", "Absolute magnitude bin edges")
self._write_dataset(g, "phi", phi, "Mpc^-3 mag^-1", "Luminosity function phi(M)")
self._write_dataset(g, "phi_err", phi_err, "Mpc^-3 mag^-1", "Uncertainty on phi(M)")
self._write_dataset(g, "cov", cov, "(Mpc^-3 mag^-1)^2", "Covariance matrix of phi(M)")
[docs]
def write_smf(
self,
group: str,
mstar_centres: np.ndarray,
phi: np.ndarray,
phi_err: np.ndarray,
cov: np.ndarray,
bin_edges: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
) -> None:
"""Write a stellar mass function.
Parameters
----------
group : str
HDF5 group path, e.g. ``"smf/DESI_BGS"``.
mstar_centres : ndarray, shape (n_bins,)
log10(M_*/M_sun) bin centres [dex].
phi : ndarray, shape (n_bins,)
Stellar mass function [Mpc^-3 dex^-1].
phi_err : ndarray, shape (n_bins,)
Uncertainty [Mpc^-3 dex^-1].
cov : ndarray, shape (n_bins, n_bins)
Covariance matrix [(Mpc^-3 dex^-1)^2].
bin_edges : ndarray, shape (n_bins+1,)
log10(M_*/M_sun) bin edges [dex].
cosmo : FlatLambdaCDM
meta : dict
"""
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(
g, "log10mstar_centres", mstar_centres, "dex", "log10(M_*/M_sun) bin centres"
)
self._write_dataset(g, "bin_edges", bin_edges, "dex", "log10(M_*/M_sun) bin edges")
self._write_dataset(g, "phi", phi, "Mpc^-3 dex^-1", "Stellar mass function phi(M_*)")
self._write_dataset(g, "phi_err", phi_err, "Mpc^-3 dex^-1", "Uncertainty on phi(M_*)")
self._write_dataset(g, "cov", cov, "(Mpc^-3 dex^-1)^2", "Covariance matrix of phi(M_*)")
[docs]
def write_twopcf(
self,
group: str,
sep_centres: np.ndarray,
xi: np.ndarray,
cov: np.ndarray,
bin_edges: np.ndarray,
estimator: str,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
sep_unit: str = "Mpc",
xi_unit: str = "dimensionless",
) -> None:
"""Write a two-point correlation function.
Handles w(θ), ξ(r), w_p(r_p), and individual multipoles ξ_ℓ(s).
Parameters
----------
group : str
HDF5 group path, e.g. ``"angular_2pcf/DESI_BGS"``.
sep_centres : ndarray, shape (n_bins,)
Separation bin centres.
xi : ndarray, shape (n_bins,)
Correlation function values.
cov : ndarray, shape (n_bins, n_bins)
Covariance matrix.
bin_edges : ndarray, shape (n_bins+1,)
Separation bin edges.
estimator : str
Name of the estimator (e.g. ``"landy-szalay"``).
cosmo : FlatLambdaCDM
meta : dict
Extra attributes (e.g. pi_max_Mpc, ell, survey area).
sep_unit : str
Physical unit string for separation axis (e.g. ``"arcmin"``, ``"Mpc"``).
xi_unit : str
Physical unit string for correlation values.
"""
meta = dict(meta)
meta.setdefault("estimator", estimator)
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "sep_centres", sep_centres, sep_unit, "Separation bin centres")
self._write_dataset(g, "bin_edges", bin_edges, sep_unit, "Separation bin edges")
self._write_dataset(g, "xi", xi, xi_unit, "Correlation function")
cov_unit = f"({xi_unit})^2" if xi_unit != "dimensionless" else "dimensionless"
self._write_dataset(g, "cov", cov, cov_unit, "Covariance matrix")
[docs]
def write_multipoles(
self,
group: str,
s_centres: np.ndarray,
xi_dict: dict[int, np.ndarray],
cov: np.ndarray,
bin_edges: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
sep_unit: str = "Mpc/h",
) -> None:
"""Write redshift-space multipoles ξ_ℓ(s).
Parameters
----------
group : str
HDF5 group path.
s_centres : ndarray, shape (n_bins,)
Separation bin centres.
xi_dict : dict
Keys are multipole orders (0, 2, 4); values are shape (n_bins,).
cov : ndarray, shape (n_ell*n_bins, n_ell*n_bins)
Stacked covariance matrix.
bin_edges : ndarray, shape (n_bins+1,)
Separation bin edges.
cosmo : FlatLambdaCDM
meta : dict
sep_unit : str
"""
meta = dict(meta)
meta["ells"] = list(xi_dict.keys())
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "s_centres", s_centres, sep_unit, "Separation bin centres")
self._write_dataset(g, "bin_edges", bin_edges, sep_unit, "Separation bin edges")
for ell, xi_l in xi_dict.items():
self._write_dataset(
g, f"xi{ell}", xi_l, "dimensionless", f"Multipole ell={ell} of xi(s)"
)
self._write_dataset(
g,
"cov",
cov,
"dimensionless",
"Stacked covariance matrix (ells ordered as in attrs.ells)",
)
[docs]
def write_pk(
self,
group: str,
k_centres: np.ndarray,
pk_dict: dict[int, np.ndarray],
cov: np.ndarray,
bin_edges: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
) -> None:
"""Write power spectrum P(k) or multipoles P_ℓ(k).
Parameters
----------
group : str
HDF5 group path, e.g. ``"pk3d/DESI_BGS"``.
k_centres : ndarray, shape (n_k,)
Wavenumber bin centres [h/Mpc].
pk_dict : dict
Keys are multipole orders (0, 2, 4); values are shape (n_k,) [Mpc/h]^3.
cov : ndarray, shape (n_ell*n_k, n_ell*n_k)
Stacked covariance matrix [(Mpc/h)^6].
bin_edges : ndarray, shape (n_k+1,)
Wavenumber bin edges [h/Mpc].
cosmo : FlatLambdaCDM
meta : dict
"""
meta = dict(meta)
meta["ells"] = list(pk_dict.keys())
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "k_centres", k_centres, "h/Mpc", "Wavenumber bin centres")
self._write_dataset(g, "bin_edges", bin_edges, "h/Mpc", "Wavenumber bin edges")
for ell, pk_l in pk_dict.items():
self._write_dataset(
g, f"pk{ell}", pk_l, "(Mpc/h)^3", f"Power spectrum multipole ell={ell}"
)
self._write_dataset(g, "cov", cov, "(Mpc/h)^6", "Stacked covariance matrix")
[docs]
def write_cl(
self,
group: str,
ell_eff: np.ndarray,
cl: np.ndarray,
nl: np.ndarray,
cov: np.ndarray,
bin_edges: np.ndarray,
meta: dict[str, Any],
) -> None:
"""Write angular power spectrum C_ℓ.
Parameters
----------
group : str
HDF5 group path, e.g. ``"cl/DESI_BGS"``.
ell_eff : ndarray, shape (n_ell,)
Effective multipole per bin.
cl : ndarray, shape (n_ell,)
Angular power spectrum [sr].
nl : ndarray, shape (n_ell,)
Shot noise power spectrum [sr].
cov : ndarray, shape (n_ell, n_ell)
Covariance matrix [sr^2].
bin_edges : ndarray, shape (n_ell+1,)
Multipole bin edges.
meta : dict
"""
g = self._make_group(group, meta)
self._write_dataset(g, "ell_eff", ell_eff, "dimensionless", "Effective multipole")
self._write_dataset(g, "bin_edges", bin_edges, "dimensionless", "Multipole bin edges")
self._write_dataset(g, "cl", cl, "sr", "Angular power spectrum C_ell")
self._write_dataset(g, "nl", nl, "sr", "Shot noise power spectrum N_ell")
self._write_dataset(g, "cov", cov, "sr^2", "Covariance matrix of C_ell")
[docs]
def write_esd(
self,
group: str,
rp_centres: np.ndarray,
delta_sigma: np.ndarray,
cov: np.ndarray,
bin_edges: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
) -> None:
"""Write excess surface mass density ΔΣ(r_p).
Parameters
----------
group : str
HDF5 group path, e.g. ``"esd/DESI_BGS_KiDS"``.
rp_centres : ndarray, shape (n_bins,)
Projected separation bin centres [Mpc].
delta_sigma : ndarray, shape (n_bins,)
ΔΣ(r_p) [M_sun/pc^2].
cov : ndarray, shape (n_bins, n_bins)
Jackknife covariance matrix [(M_sun/pc^2)^2].
bin_edges : ndarray, shape (n_bins+1,)
Projected separation bin edges [Mpc].
cosmo : FlatLambdaCDM
meta : dict
Should include lens_survey, source_survey, m_bias, R_mean,
photo_z_correction, boost_correction flags.
"""
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "rp_centres", rp_centres, "Mpc", "Projected separation bin centres")
self._write_dataset(g, "bin_edges", bin_edges, "Mpc", "Projected separation bin edges")
self._write_dataset(
g,
"delta_sigma",
delta_sigma,
"M_sun/pc^2",
"Excess surface mass density Delta_Sigma(r_p)",
)
self._write_dataset(
g, "cov", cov, "(M_sun/pc^2)^2", "Jackknife covariance matrix of Delta_Sigma"
)
[docs]
def write_knn(
self,
group: str,
r_centres: np.ndarray,
F_k: np.ndarray,
F_k_poisson: np.ndarray,
bin_edges: np.ndarray,
k_values: np.ndarray,
cosmo: FlatLambdaCDM,
meta: dict[str, Any],
) -> None:
"""Write kNN-CDF summary statistics.
Parameters
----------
group : str
HDF5 group path, e.g. ``"knn/DESI_BGS"``.
r_centres : ndarray, shape (n_r,)
Separation bin centres [Mpc].
F_k : ndarray, shape (n_k, n_r)
Empirical kNN-CDF for each k in k_values.
F_k_poisson : ndarray, shape (n_k, n_r)
Erlang (Poisson) reference CDF for each k.
bin_edges : ndarray, shape (n_r+1,)
Separation bin edges [Mpc].
k_values : ndarray, shape (n_k,)
Neighbor orders used (e.g. [1, 2, 3, 4, 5]).
cosmo : FlatLambdaCDM
meta : dict
Extra attributes (estimator, n_query, n_gal, survey, etc.).
"""
meta = dict(meta)
meta.setdefault("estimator", "knn-cdf")
g = self._make_group(group, meta)
self._write_cosmology(g, cosmo)
self._write_dataset(g, "r_centres", r_centres, "Mpc", "Separation bin centres")
self._write_dataset(g, "bin_edges", bin_edges, "Mpc", "Separation bin edges")
self._write_dataset(g, "k_values", k_values, "dimensionless", "Neighbor orders")
self._write_dataset(
g, "F_k", F_k, "dimensionless",
"Empirical kNN-CDF; shape (n_k, n_r)"
)
self._write_dataset(
g, "F_k_poisson", F_k_poisson, "dimensionless",
"Poisson (Erlang) reference kNN-CDF; shape (n_k, n_r)"
)
[docs]
def close(self) -> None:
"""Close the HDF5 file."""
self._file.close()
def __enter__(self) -> SummaryStatWriter:
return self
def __exit__(self, *args) -> None:
self.close()
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _make_group(self, path: str, meta: dict[str, Any]) -> h5py.Group:
g = self._file.require_group(path)
for key, val in meta.items():
if val is not None:
try:
g.attrs[key] = val
except TypeError:
g.attrs[key] = str(val)
return g
def _write_dataset(
self,
node: h5py.Group,
name: str,
data: np.ndarray,
unit: str,
description: str,
) -> None:
if name in node:
del node[name]
ds = node.create_dataset(name, data=np.asarray(data, dtype=np.float64))
ds.attrs["unit"] = unit
ds.attrs["description"] = description
def _write_cosmology(self, group: h5py.Group, cosmo: FlatLambdaCDM) -> None:
cg = group.require_group("cosmology")
cg.attrs["name"] = getattr(cosmo, "name", "") or ""
for name, val, unit in [
("H0", float(cosmo.H0.value), "km/s/Mpc"),
("h", float(cosmo.H0.value / 100.0), "dimensionless"),
("Om0", float(cosmo.Om0), "dimensionless"),
("Ob0", float(cosmo.Ob0) if cosmo.Ob0 is not None else float("nan"), "dimensionless"),
("Ok0", 0.0, "dimensionless"),
]:
ds = cg.require_dataset(name, shape=(), dtype=np.float64)
ds[()] = val
ds.attrs["unit"] = unit