diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md index d820337987..a6acbfdcc7 100644 --- a/docs/api/preprocessing.md +++ b/docs/api/preprocessing.md @@ -29,6 +29,7 @@ For visual quality control, see {func}`~scanpy.pl.highest_expr_genes` and pp.log1p pp.pca pp.normalize_total + pp.normalize_clr pp.regress_out pp.scale pp.sample diff --git a/docs/references.bib b/docs/references.bib index aa2261fe7b..906747e0c2 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -111,6 +111,16 @@ @article{Blondel2008 pages = {P10008}, } +@article{Booeshaghi2022, + author = {Booeshaghi, A. Sina and Hallgrímsdóttir, Ingileif B. and Gálvez-Merchán, Ángel and Pachter, Lior}, + title = {Depth normalization for single-cell genomics count data}, + year = {2026}, + url = {https://doi.org/10.1101/2022.05.06.490859}, + doi = {10.1101/2022.05.06.490859}, + publisher = {Cold Spring Harbor Laboratory}, + journal = {bioRxiv}, +} + @article{Burczynski2006, author = {Burczynski, Michael E. and Peterson, Ron L. and Twine, Natalie C. and Zuberek, Krystyna A. and Brodeur, Brendan J. and Casciotti, Lori and Maganti, Vasu and Reddy, Padma S. and Strahs, Andrew and Immermann, Fred and Spinelli, Walter and Schwertschlag, Ulrich and Slager, Anna M. and Cotreau, Monette M. and Dorner, Andrew J.}, title = {Molecular Classification of Crohn’s Disease and Ulcerative Colitis Patients Using Transcriptional Profiles in Peripheral Blood Mononuclear Cells}, diff --git a/docs/release-notes/4160.feat.md b/docs/release-notes/4160.feat.md new file mode 100644 index 0000000000..02b93ce488 --- /dev/null +++ b/docs/release-notes/4160.feat.md @@ -0,0 +1 @@ +Add {func}`scanpy.pp.normalize_clr` for shifted centered log-ratio (PFlog1pPF) normalization, a variance-stabilizing, depth-invariant and rank-preserving count transform {cite:p}`Booeshaghi2022` {smaller}`R Baber` diff --git a/src/scanpy/preprocessing/__init__.py b/src/scanpy/preprocessing/__init__.py index cb1aedb90c..974e133124 100644 --- a/src/scanpy/preprocessing/__init__.py +++ b/src/scanpy/preprocessing/__init__.py @@ -7,7 +7,7 @@ from ._deprecated.sampling import subsample from ._harmony import harmony_integrate from ._highly_variable_genes import highly_variable_genes -from ._normalization import normalize_total +from ._normalization import normalize_clr, normalize_total from ._pca import pca from ._qc import calculate_qc_metrics from ._recipes import recipe_seurat, recipe_weinreb17, recipe_zheng17 @@ -33,6 +33,7 @@ "highly_variable_genes", "log1p", "neighbors", + "normalize_clr", "normalize_total", "pca", "recipe_seurat", diff --git a/src/scanpy/preprocessing/_normalization.py b/src/scanpy/preprocessing/_normalization.py index 79eb5cf0d1..a16fe0ae1e 100644 --- a/src/scanpy/preprocessing/_normalization.py +++ b/src/scanpy/preprocessing/_normalization.py @@ -7,6 +7,7 @@ import numpy as np from fast_array_utils import stats from fast_array_utils.numba import njit +from fast_array_utils.stats import mean_var from .. import logging as logg from .._compat import CSBase, CSCBase, CSRBase, DaskArray, warn @@ -14,6 +15,8 @@ from ..get import _get_obs_rep, _set_obs_rep if TYPE_CHECKING: + from typing import Literal + from anndata import AnnData @@ -304,3 +307,262 @@ def normalize_total( # noqa: PLR0912 elif not inplace: return dat return None + + +# ----------------------------------------------------------------------------- +# Shifted Centered Log-Ratio (PFlogPF) Core Implementation +# +# The mathematical logic and sparse matrix optimization ("offset trick") +# implemented below are adapted from the Pachter Lab reference repository: +# https://github.com/pachterlab/bhgp_2022 +# +# Reference Publication: +# Booeshaghi et al. (2022, 2026) "Depth normalization for single-cell genomics +# count data" bioRxiv. doi: 10.1101/2022.05.06.490859 +# +# Copyright (c) 2022, Pachter Lab. All rights reserved. +# Licensed under the BSD 2-Clause License. +# ----------------------------------------------------------------------------- + + +def _estimate_overdispersion(x: np.ndarray | CSBase | DaskArray) -> float: + r"""Estimate the negative-binomial overdispersion :math:`α` from raw counts. + + Fits :math:`\mathrm{Var}_g = μ_g + α \cdot μ_g^2` across genes, where + :math:`μ_g` and :math:`\mathrm{Var}_g` are the per-gene mean and (population) + variance over cells. The model is linear in :math:`α`, so the ordinary + least-squares solution is closed form + + .. math:: + α = \frac{\sum_g (\mathrm{Var}_g - μ_g) \, μ_g^2}{\sum_g μ_g^4}, + + which is exactly the minimizer a non-linear `curve_fit` would converge to, + but without the dependency. :func:`~fast_array_utils.stats.mean_var` is + dispatched for dense, sparse and dask input alike, so the only dask-specific + step is computing the two final scalar sums. + """ + mu, var = mean_var(x, axis=0, correction=0) + mu2 = mu**2 + numerator = np.sum((var - mu) * mu2) + denominator = np.sum(mu2 * mu2) + if isinstance(x, DaskArray): + import dask + + numerator, denominator = dask.compute(numerator, denominator) + if denominator == 0.0: + msg = ( + "Cannot estimate overdispersion: every gene has zero mean. " + "Pass `alpha` or `target_sum` explicitly." + ) + raise ValueError(msg) + return float(numerator / denominator) + + +def _clr_log_center(u: np.ndarray | CSBase) -> np.ndarray: + """Apply log1p then per-cell mean-centering to PF-scaled counts; return dense. + + Operates on a full-width block of cells: every gene must be present so the + per-cell mean spans the whole row. Used directly for in-memory input, and + per row-chunk under :meth:`~dask.array.Array.map_blocks` (after the genes + have been rechunked into a single chunk). + """ + n_genes = u.shape[1] + if isinstance(u, CSBase): + # Sparse "offset trick": store log1p only at the nonzero entries. The + # structural zeros already hold log1p(0) = 0, so the per-cell mean is just + # the stored row sum / n_genes, and no densification is needed until the + # final centering step. + u.data = np.log1p(u.data) + per_cell_mean = np.asarray(u.sum(axis=1)).ravel() / n_genes + dense_log = u.toarray() + else: + u = np.asarray(u) + dense_log = np.log1p(u) + per_cell_mean = dense_log.mean(axis=1) + + # Additive centering (the CLR step): map each cell onto the zero-sum + # (Aitchison) hyperplane. This is what fills the zeros and densifies the matrix. + return dense_log - per_cell_mean[:, None] + + +def _normalize_clr_helper( + x: np.ndarray | CSBase | DaskArray, + *, + target_sum: float | None, + alpha: float | Literal["auto"] | None, +) -> tuple[np.ndarray | DaskArray, np.ndarray | DaskArray]: + """Compute the shifted CLR (PFlog1pPF) transform backend logic. + + This function acts as the internal processing engine for `normalize_clr`, + adapted from the reference implementations provided by the Pachter Lab + (Booeshaghi et al. 2022). It applies initial depth-scaling followed by + an optimized sparse log-shift operation before mapping coordinates onto + the zero-sum Aitchison hyperplane. + + Returns the dense, per-cell-centered matrix and the raw per-cell depths + (the latter is used by the caller to warn about empty cells). + + See `normalize_clr` for the meaning of the parameters. + """ + # Keep the depths lazy for dask; `.ravel()` would otherwise materialize them. + cell_depths = stats.sum(x, axis=1) + if not isinstance(x, DaskArray): + cell_depths = np.asarray(cell_depths).ravel() + + # Resolve `target_sum` (the paper's PF target constant K, Booeshaghi et al. + # 2022): the depth the first proportional-fitting step scales every cell to. + if alpha is not None: + if alpha == "auto": + # Estimate the overdispersion from the raw counts (closed-form OLS, + # as in the reference implementation). + alpha = _estimate_overdispersion(x) + if not alpha > 0: + msg = ( + f"`alpha` must be positive to derive K = 4 * alpha * s, got {alpha}. " + "The data may be underdispersed; pass `target_sum` explicitly instead." + ) + raise ValueError(msg) + # Delta-method calibration: K = 4 * alpha * s, where s is the mean cell + # depth. This sets the count-scale pseudocount to the variance-stabilizing + # value y0 = 1 / (4 * alpha), and overrides any value passed as `target_sum`. + target_sum = 4.0 * alpha * float(cell_depths.mean()) + elif target_sum is None: + # `float(...)` triggers a single blocking `.compute()` for dask input. + target_sum = float(cell_depths.mean()) + # else: use the `target_sum` passed by the caller. + + # Proportional fitting: rescale each cell so its counts sum to `target_sum` (K). + # Dividing by `cell_depths / target_sum` (with allow_divide_by_zero=False) + # leaves empty cells as all-zero rows instead of producing invalid infinities. + u = axis_mul_or_truediv( + x, cell_depths / target_sum, op=truediv, axis=0, allow_divide_by_zero=False + ) + + if isinstance(x, DaskArray): + # Per-cell centering needs the whole row, so collapse the genes into a + # single chunk; the transform is then embarrassingly parallel over the + # row-chunks. The output is always dense, hence the dense numpy `meta`. + u = u.rechunk({1: -1}) + dense_log = u.map_blocks( + _clr_log_center, dtype=np.float64, meta=np.array([], dtype=np.float64) + ) + else: + dense_log = _clr_log_center(u) + + return dense_log, cell_depths + + +def normalize_clr( + adata: AnnData, + *, + target_sum: float | None = None, + alpha: float | Literal["auto"] | None = None, + layer: str | None = None, + inplace: bool = True, + copy: bool = False, +) -> AnnData | dict[str, np.ndarray] | None: + r"""Normalize counts with the shifted centered log-ratio (PFlog1pPF) transform. + + Computes the shifted centered log-ratio (CLR) transform + + .. math:: + T(x)_i = \log(u_i + 1) - \frac{1}{D} \sum_{j=1}^D \log(u_j + 1), + + where :math:`u_i = K \, x_i / \sum_j x_j` are the depth-normalized counts + (proportional fitting to a target depth :math:`K`) and :math:`D` is the + number of genes. Equivalently this is proportional fitting, then + :obj:`~numpy.log1p`, then per-cell mean-centering in log space (the + centered-log-ratio step). This count transform is simultaneously + variance-stabilizing, depth-invariant, and rank-preserving :cite:p:`Booeshaghi2022`. + + Because the per-cell centering fills the zero entries, the output is always + dense, and each cell sums to exactly zero. + + .. note:: + When used with a :class:`~dask.array.Array` in `adata.X`, deriving the + proportional-fitting target :math:`K` from the data requires a global + reduction and therefore triggers a blocking `.compute()`: once for the + default mean-depth target, and once more for `alpha` (including + ``alpha="auto"``). Only the scalar reduction is materialized, not the + matrix. Passing `target_sum` explicitly avoids this and keeps the whole + operation lazy. + + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + target_sum + Target depth :math:`K` for the first proportional-fitting step. If `None` + (and `alpha` is not given), the empirical mean cell depth is used. Unlike + :func:`~scanpy.pp.normalize_total`, this is only an *intermediate* target: + cells sum to `target_sum` after proportional fitting, but the subsequent + log and centering steps make each cell sum to exactly zero in the output. + alpha + Negative-binomial overdispersion of the dataset (``var = μ + α·μ²``). When + given, it overrides `target_sum` and sets :math:`K = 4 \cdot α \cdot s` by + the delta method, where :math:`s` is the mean cell depth, calibrating the + count-scale pseudocount to the variance-stabilizing value ``y0 = 1/(4·α)`` + :cite:p:`Booeshaghi2022`. Pass ``"auto"`` to estimate :math:`α` from the + data (closed-form least squares of ``var = μ + α·μ²`` across genes). A + :class:`ValueError` is raised if the estimated or supplied :math:`α` is + not positive (e.g. underdispersed data); pass `target_sum` instead. + layer + Layer to normalize instead of `X`. + inplace + Whether to update `adata` or return a dictionary with the normalized + matrix. + copy + Whether to modify a copied input object. Not compatible with + `inplace=False`. + + Returns + ------- + Returns a dictionary with the normalized copy of `adata.X` (key `"X"`) or + updates `adata` with the normalized matrix, depending on `inplace`. + + Example + ------- + >>> import numpy as np + >>> from anndata import AnnData + >>> import scanpy as sc + >>> adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]], dtype="float32")) + >>> sc.pp.normalize_clr(adata) + >>> np.allclose(adata.X.sum(axis=1), 0, atol=1e-6) + True + """ + if copy: + if not inplace: + msg = "`copy=True` cannot be used with `inplace=False`." + raise ValueError(msg) + adata = adata.copy() + + view_to_actual(adata) + + x = _get_obs_rep(adata, layer=layer) + if isinstance(x, CSCBase): + x = x.tocsr() + if issubclass(x.dtype.type, int | np.integer): + x = x.astype(np.float64) + + start = logg.info("normalizing counts per cell via shifted CLR") + + x, cell_depths = _normalize_clr_helper(x, target_sum=target_sum, alpha=alpha) + + if not isinstance(cell_depths, DaskArray) and not np.all(cell_depths > 0): + warn("Some cells have zero counts", UserWarning) + + dat = dict(X=x) + if inplace: + _set_obs_rep(adata, dat["X"], layer=layer) + + logg.info( + " finished ({time_passed})", + time=start, + ) + + if copy: + return adata + elif not inplace: + return dat + return None diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 50347c8ab9..0fc4caca5d 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -11,6 +11,7 @@ from scipy import sparse import scanpy as sc +from scanpy._compat import CSBase from scanpy.preprocessing._normalization import _compute_nnz_median from testing.scanpy._helpers import ( _check_check_values_warnings, @@ -19,7 +20,11 @@ ) # TODO: Add support for sparse-in-dask -from testing.scanpy._pytest.params import ARRAY_TYPES, ARRAY_TYPES_DENSE +from testing.scanpy._pytest.params import ( + ARRAY_TYPES, + ARRAY_TYPES_DENSE, + ARRAY_TYPES_MEM, +) if TYPE_CHECKING: from collections.abc import Callable @@ -329,3 +334,199 @@ def test_compute_nnz_median(array_type, dtype): data = np.array([0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype) data = array_type(data) np.testing.assert_allclose(_compute_nnz_median(data), 5) + + +# ------------------------------------------------------------------------------ +# normalize_clr (shifted CLR / PFlog1pPF) +# ------------------------------------------------------------------------------ + +# A small count matrix with no empty cells, used for the value/equivalence tests. +X_clr = np.array( + [[5, 0, 3, 2], [1, 1, 0, 4], [0, 7, 2, 1], [3, 3, 3, 3]], dtype="float32" +) + + +def _estimate_alpha_reference(x) -> float: + """Closed-form OLS overdispersion, independent of the implementation.""" + x = np.asarray(to_ndarray(x), dtype=np.float64) + mu = x.mean(axis=0) + var = (x**2).mean(axis=0) - mu**2 + mu2 = mu**2 + return float(np.sum((var - mu) * mu2) / np.sum(mu2 * mu2)) + + +def _clr_reference(x, *, target_sum=None, alpha=None) -> np.ndarray: + """Self-contained dense shifted-CLR, independent of the implementation. + + PF to a target depth, log1p, then subtract the per-cell mean. Empty cells + (zero depth) are left as all-zero rows, matching `normalize_clr`. + """ + x = np.asarray(to_ndarray(x), dtype=np.float64) + depths = x.sum(axis=1) + # Resolve the PF target depth K (paper's notation; exposed as `target_sum`). + if alpha is not None: + if alpha == "auto": + alpha = _estimate_alpha_reference(x) + target_sum = 4.0 * alpha * depths.mean() + elif target_sum is None: + target_sum = depths.mean() + safe_depths = np.where(depths == 0, 1.0, depths) + u = x * (target_sum / safe_depths)[:, None] + log_u = np.log1p(u) + return log_u - log_u.mean(axis=1, keepdims=True) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +@pytest.mark.parametrize("dtype", ["float32", "int64"]) +def test_normalize_clr_values(array_type, dtype): + """Values match the reference and every cell sums to zero, for all layouts. + + Asserting that numpy / csr / csc / sparse-array inputs all equal the same + dense reference also proves the sparse "offset trick" matches the dense path. + """ + adata = AnnData(array_type(X_clr).astype(dtype)) + sc.pp.normalize_clr(adata) + result = to_ndarray(adata.X) + + np.testing.assert_allclose(result, _clr_reference(X_clr), rtol=1e-5, atol=1e-5) + # zero-sum (Aitchison) hyperplane + np.testing.assert_allclose(result.sum(axis=1), 0.0, atol=1e-5) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +@pytest.mark.parametrize( + "kwargs", + [{}, {"target_sum": 1e4}, {"alpha": 0.5}, {"alpha": "auto"}], + ids=["default", "target_sum", "alpha", "alpha_auto"], +) +def test_normalize_clr_params(array_type, kwargs): + adata = AnnData(array_type(X_clr).astype("float32")) + sc.pp.normalize_clr(adata, **kwargs) + np.testing.assert_allclose( + to_ndarray(adata.X), _clr_reference(X_clr, **kwargs), rtol=1e-5, atol=1e-5 + ) + + +def test_normalize_clr_alpha_overrides_target_sum(): + """`alpha` sets target_sum = 4*alpha*scale and overrides any given `target_sum`.""" + alpha = 0.5 + scale = X_clr.sum(axis=1).mean() + + via_alpha = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + sc.pp.normalize_clr(via_alpha, alpha=alpha) + + via_target = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + sc.pp.normalize_clr(via_target, target_sum=4.0 * alpha * scale) + np.testing.assert_allclose( + to_ndarray(via_alpha.X), to_ndarray(via_target.X), rtol=1e-5, atol=1e-5 + ) + + # passing both -> alpha wins, target_sum ignored + both = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + sc.pp.normalize_clr(both, alpha=alpha, target_sum=999.0) + np.testing.assert_allclose( + to_ndarray(both.X), to_ndarray(via_alpha.X), rtol=1e-5, atol=1e-5 + ) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +def test_normalize_clr_alpha_auto(array_type): + """`alpha="auto"` estimates the overdispersion and matches an explicit alpha.""" + estimated = _estimate_alpha_reference(X_clr) + assert estimated > 0 + + auto = AnnData(array_type(X_clr).astype("float32")) + sc.pp.normalize_clr(auto, alpha="auto") + + explicit = AnnData(array_type(X_clr).astype("float32")) + sc.pp.normalize_clr(explicit, alpha=estimated) + np.testing.assert_allclose( + to_ndarray(auto.X), to_ndarray(explicit.X), rtol=1e-5, atol=1e-5 + ) + + +@pytest.mark.parametrize("alpha", [0.0, -0.5], ids=["zero", "negative"]) +def test_normalize_clr_nonpositive_alpha_raises(alpha): + """A non-positive `alpha` cannot derive K = 4*alpha*s and raises.""" + adata = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + with pytest.raises(ValueError, match=r"alpha.*positive"): + sc.pp.normalize_clr(adata, alpha=alpha) + + +def test_normalize_clr_alpha_auto_zero_mean_raises(): + """`alpha="auto"` cannot estimate overdispersion when every gene mean is zero.""" + adata = AnnData(np.zeros((3, 4), dtype="float32")) + with pytest.raises(ValueError, match="Cannot estimate overdispersion"): + sc.pp.normalize_clr(adata, alpha="auto") + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +def test_normalize_clr_zero_cell(array_type): + """An empty cell stays all-zero, stays finite, and triggers a warning.""" + x = X_clr.copy() + x[1] = 0 # make the second cell empty + adata = AnnData(array_type(x)) + with pytest.warns(UserWarning, match="Some cells have zero counts"): + sc.pp.normalize_clr(adata) + result = to_ndarray(adata.X) + assert np.isfinite(result).all() + np.testing.assert_allclose(result[1], 0.0, atol=1e-6) + + +def test_normalize_clr_inplace_false(): + adata = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + x_before = to_ndarray(adata.X).copy() + out = sc.pp.normalize_clr(adata, inplace=False) + + assert isinstance(out, dict) + np.testing.assert_allclose(out["X"], _clr_reference(X_clr), rtol=1e-5, atol=1e-5) + # input is left untouched + assert isinstance(adata.X, CSBase) + np.testing.assert_array_equal(to_ndarray(adata.X), x_before) + + +def test_normalize_clr_copy(): + adata = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + returned = sc.pp.normalize_clr(adata, copy=True) + + assert isinstance(returned, AnnData) + assert returned is not adata + np.testing.assert_allclose( + to_ndarray(returned.X), _clr_reference(X_clr), rtol=1e-5, atol=1e-5 + ) + # original is left untouched + assert isinstance(adata.X, CSBase) + + +def test_normalize_clr_copy_inplace_error(): + adata = AnnData(sparse.csr_matrix(X_clr)) # noqa: TID251 + with pytest.raises( + ValueError, match="`copy=True` cannot be used with `inplace=False`" + ): + sc.pp.normalize_clr(adata, copy=True, inplace=False) + + +def test_normalize_clr_layer(): + """`layer` targets that layer and leaves `X` untouched.""" + adata = AnnData( + sparse.csr_matrix(X_clr), # noqa: TID251 + layers={"counts": sparse.csr_matrix(X_clr)}, # noqa: TID251 + ) + x_before = to_ndarray(adata.X).copy() + sc.pp.normalize_clr(adata, layer="counts") + + np.testing.assert_array_equal(to_ndarray(adata.X), x_before) + np.testing.assert_allclose( + to_ndarray(adata.layers["counts"]), + _clr_reference(X_clr), + rtol=1e-5, + atol=1e-5, + ) + + +def test_normalize_clr_view(): + adata = AnnData(X_clr.copy()) + v = adata[:, :] + with pytest.warns(UserWarning, match=r"Received a view"): + sc.pp.normalize_clr(v) + assert not v.is_view diff --git a/tests/test_package_structure.py b/tests/test_package_structure.py index 9f08af6725..95789eca00 100644 --- a/tests/test_package_structure.py +++ b/tests/test_package_structure.py @@ -96,7 +96,9 @@ class ExpectedSig(TypedDict): # other partial exceptions copy_sigs["sc.pp.normalize_total"]["return_ann"] = copy_sigs[ "sc.experimental.pp.normalize_pearson_residuals" -]["return_ann"] = "AnnData | dict[str, np.ndarray] | None" +]["return_ann"] = copy_sigs["sc.pp.normalize_clr"]["return_ann"] = ( + "AnnData | dict[str, np.ndarray] | None" +) copy_sigs["sc.external.pp.magic"]["copy_default"] = None