Source code for moscot.datasets

import os
import pickle
import urllib.request
from itertools import combinations
from types import MappingProxyType
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union

import mudata as mu

import networkx as nx
import numpy as np
import pandas as pd
from scipy.linalg import block_diag

import anndata as ad
from scanpy.readwrite import _check_datafile_present_and_download

from moscot._types import PathLike

__all__ = [
    "bone_marrow",
    "c_elegans",
    "drosophila",
    "hspc",
    "mosta",
    "pancreas_multiome",
    "sim_align",
    "simulate_data",
    "zebrafish",
]



[docs]
def mosta(
    path: PathLike = "~/.cache/moscot/mosta.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:  # pragma: no cover
    """Preprocessed and extracted data as provided in :cite:`chen:22`.

    Includes embryo sections `E9.5`, `E2S1`, `E10.5`, `E2S1`, `E11.5`, `E1S2`.

    The :attr:`anndata.AnnData.X` is based on reprocessing of the counts data using
    :func:`scanpy.pp.normalize_total` and :func:`scanpy.pp.log1p`.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    return _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/40569779",
        expected_shape=(54134, 2000),
        force_download=force_download,
        **kwargs,
    )




[docs]
def hspc(
    path: PathLike = "~/.cache/moscot/hspc.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:  # pragma: no cover
    """CD34+ hematopoietic stem and progenitor cells from 4 healthy human donors.

    From the `NeurIPS Multimodal Single-Cell Integration Challenge
    <https://www.kaggle.com/competitions/open-problems-multimodal/data>`_.

    4000 cells were randomly selected after filtering the multiome training data of the donor `31800`.
    Subsequently, the top 2000 highly variable genes were selected. Peaks appearing in less than 5%
    of the cells were filtered out, resulting in 11595 peaks.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    dataset = _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/37993503",
        expected_shape=(4000, 2000),
        force_download=force_download,
        **kwargs,
    )
    dataset.obs["day"] = dataset.obs["day"].astype("category")  # better solution to this?

    return dataset




[docs]
def drosophila(
    path: PathLike = "~/.cache/moscot/drosophila.h5ad",
    *,
    spatial: bool,
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:
    """Embryo of Drosophila melanogaster described in :cite:`Li-spatial:22`.

    Minimal pre-processing was performed, such as gene and cell filtering, as well as normalization.

    Parameters
    ----------
    path
        Path where to save the file.
    spatial
        Whether to return the spatial or the scRNA-seq dataset.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    path, _ = os.path.splitext(path)
    if spatial:
        return _load_dataset_from_url(
            path + "_sp.h5ad",
            file_type="h5ad",
            backup_url="https://figshare.com/ndownloader/files/37984935",
            expected_shape=(3039, 82),
            force_download=force_download,
            **kwargs,
        )

    return _load_dataset_from_url(
        path + "_sc.h5ad",
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/37984938",
        expected_shape=(1297, 2000),
        force_download=force_download,
        **kwargs,
    )




[docs]
def c_elegans(
    path: PathLike = "~/.cache/moscot/c_elegans.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> Tuple[ad.AnnData, nx.DiGraph]:  # pragma: no cover
    """scRNA-seq time-series dataset of C.elegans embryogenesis :cite:`packer:19`.

    Contains raw counts of 46,151 cells with at least partial lineage information.
    In addition, this downloads the known C. elegans lineage tree.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object and the lineage tree.
    """
    adata = _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/39943585",
        expected_shape=(46151, 20222),
        force_download=force_download,
        **kwargs,
    )

    with urllib.request.urlopen("https://figshare.com/ndownloader/files/43064842") as fin:
        tree = nx.read_gml(fin)

    return adata, tree




[docs]
def zebrafish(
    path: PathLike = "~/.cache/moscot/zebrafish.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> Tuple[ad.AnnData, Dict[str, nx.DiGraph]]:
    """Lineage-traced scRNA-seq time-series dataset of Zebrafish heart regeneration :cite:`hu:22`.

    Contains gene expression vectors, LINNAEUS :cite:`spanjaard:18` reconstructed lineage trees,
    a low-dimensional embedding, and additional metadata.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object and the lineage trees.
    """
    adata = _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/39951073",
        expected_shape=(44014, 31466),
        force_download=force_download,
        **kwargs,
    )
    # TODO(michalk8): also cache or store in AnnData ad Newick + reconstruct?
    with urllib.request.urlopen("https://figshare.com/ndownloader/files/39951076") as fin:
        trees = pickle.load(fin)

    return adata, trees




[docs]
def bone_marrow(
    path: PathLike = "~/.cache/moscot/bone_marrow.h5ad",
    *,
    rna: bool,
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:
    """Multiome data of bone marrow measurements :cite:`luecken:21`.

    Contains processed counts of 6,224 cells. The RNA data was filtered to 2,000 top
    highly variable genes, the ATAC data was filtered to 8,000 top highly variable
    peaks.

    Parameters
    ----------
    path
        Path where to save the file.
    rna
        Return the RNA data If :obj:`True`, otherwise return ATAC data.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    path, _ = os.path.splitext(path)
    if rna:
        return _load_dataset_from_url(
            path + "_rna.h5ad",
            file_type="h5ad",
            backup_url="https://figshare.com/ndownloader/files/40195114",
            expected_shape=(6224, 2000),
            force_download=force_download,
            **kwargs,
        )
    return _load_dataset_from_url(
        path + "_atac.h5ad",
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/41013551",
        expected_shape=(6224, 8000),
        force_download=force_download,
        **kwargs,
    )




[docs]
def pancreas_multiome(
    rna_only: bool,
    path: PathLike = "~/.cache/moscot/pancreas_multiome.h5mu",
    force_download: bool = True,
    **kwargs: Any,
) -> Union[mu.MuData, ad.AnnData]:  # pragma: no cover
    """Pancreatic endocrinogenesis dataset published with the moscot manuscript :cite:`klein:25`.

    The dataset contains paired scRNA-seq and scATAC-seq data of pancreatic cells at embryonic days 14.5, 15.5, and
    16.5. The data was preprocessed and filtered as described in the manuscript, the raw data and the full processed
    data are available via `GEO <https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi>`_ accession code GSE275562.

    Parameters
    ----------
    rna_only
        Only load the RNA data, resulting in a smaller file.
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`anndata.io.read_h5ad` if `rna_only`, else for :func:`mudata.read`.

    Returns
    -------
        :class:`mudata.MuData` object with RNA and ATAC data if `rna_only`, else :class:`anndata.AnnData` with RNA only.
    """
    if rna_only:
        return _load_dataset_from_url(
            path,
            file_type="h5ad",
            backup_url="https://figshare.com/ndownloader/files/49725087",
            expected_shape=(22604, 20242),
            force_download=force_download,
            **kwargs,
        )
    return _load_dataset_from_url(
        path,
        file_type="h5mu",
        backup_url="https://figshare.com/ndownloader/files/48782332",
        expected_shape=(22604, 271918),
        force_download=force_download,
    )




[docs]
def tedsim(
    path: PathLike = "~/.cache/moscot/tedsim.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:  # pragma: no cover
    """Dataset simulated with TedSim :cite:`pan:22`.

    Simulated scRNA-seq dataset of a differentiation trajectory. For each cell, the dataset includes a (raw counts)
    gene expression vector as well as a lineage barcodes. The data was simulated with asymmetric division rate of
    :math:`0.2`, intermediate state step size of :math:`0.2` and contains the following fields:

    - :attr:`obsm['barcodes'] <anndata.AnnData.obsm>` - barcodes.
    - :attr:`obsp['cost_matrices'] <anndata.AnnData.obsp>` - precomputed lineage cost matrices.
    - :attr:`uns['tree'] <anndata.AnnData.uns>` - lineage tree in the
      `Newick format <https://en.wikipedia.org/wiki/Newick_format>`_.
    - :attr:`uns['couplings' ] <anndata.AnnData.uns>` - coupling matrix based on the ground-truth lineage tree.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    return _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/40178644",
        expected_shape=(8448, 500),
        force_download=force_download,
        **kwargs,
    )




[docs]
def sciplex(
    path: PathLike = "~/.cache/moscot/sciplex.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:  # pragma: no cover
    """Perturbation dataset published in :cite:`srivatsan:20`.

    Transcriptomes of A549, K562, and mCF7 cells exposed to 188 compounds.
    Data obtained from :cite:`scperturb:24`.

    Parameters
    ----------
    path
        Path where to save the file.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    return _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/43381398",
        expected_shape=(799317, 110984),
        force_download=force_download,
        **kwargs,
    )




[docs]
def sim_align(
    path: PathLike = "~/.cache/moscot/sim_align.h5ad",
    force_download: bool = False,
    **kwargs: Any,
) -> ad.AnnData:  # pragma: no cover
    """Spatial transcriptomics simulated dataset as described in :cite:`Jones-spatial:22`.

    Parameters
    ----------
    path
        Location where the file is saved to.
    force_download
        Whether to force-download the data.
    kwargs
        Keyword arguments for :func:`scanpy.read`.

    Returns
    -------
    Annotated data object.
    """
    return _load_dataset_from_url(
        path,
        file_type="h5ad",
        backup_url="https://figshare.com/ndownloader/files/37984926",
        expected_shape=(1200, 500),
        force_download=force_download,
        **kwargs,
    )




[docs]
def simulate_data(
    n_distributions: int = 2,
    cells_per_distribution: int = 20,
    n_genes: int = 60,
    key: Literal["day", "batch"] = "batch",
    var: float = 1.0,
    obs_to_add: Mapping[str, Any] = MappingProxyType({"celltype": 3}),
    marginals: Optional[Tuple[str, str]] = None,
    seed: int = 0,
    quad_term: Optional[Literal["tree", "barcode", "spatial"]] = None,
    lin_cost_matrix: Optional[str] = None,
    quad_cost_matrix: Optional[str] = None,
    **kwargs: Any,
) -> ad.AnnData:
    """Simulate data.

    This function is used to generate data, mainly for the purpose of
    demonstrating certain functionalities of :mod:`moscot`.

    Parameters
    ----------
    n_distributions
        Number of distributions defined by `key`.
    cells_per_distribution
        Number of cells per distribution.
    n_genes
        Number of genes per simulated cell.
    key
        Key to identify distribution allocation.
    var
        Variance of one cell distribution
    obs_to_add
        Dictionary of names to add to columns of :attr:`anndata.AnnData.obs`
        and number of different values for this column.
    marginals
        Column names of :attr:`anndata.AnnData.obs` where to save the randomly
        generated marginals. If `None`, no marginals are generated.
    seed
        Random seed.
    quad_term
        Literal indicating whether to add costs corresponding to a specific problem setting.
        If `None`, no quadratic cost element is generated.
    lin_cost_matrix
        Key where to save the linear cost matrix. It is generated according to the pairwise policy.
        If `None`, no linear cost matrix is generated.
    quad_cost_matrix
        Key where to save the quadratic cost matrices. If `None`, no quadratic cost matrix is generated.

    Returns
    -------
    :class:`anndata.AnnData`.
    """
    rng = np.random.RandomState(seed)
    adatas = [
        ad.AnnData(
            X=rng.multivariate_normal(
                mean=kwargs.pop("mean", np.arange(n_genes)),
                cov=kwargs.pop("cov", var * np.diag(np.ones(n_genes))),
                size=cells_per_distribution,
            ),
        )
        for _ in range(n_distributions)
    ]
    adata = ad.concat(adatas, label=key, index_unique="-")
    if key == "day":
        adata.obs["day"] = pd.to_numeric(adata.obs["day"]).astype("category")
    for k, val in obs_to_add.items():
        adata.obs[k] = rng.choice(range(val), len(adata))
    if marginals:
        adata.obs[marginals[0]] = rng.uniform(1e-5, 1, len(adata))
        adata.obs[marginals[1]] = rng.uniform(1e-5, 1, len(adata))
    if quad_term == "tree":
        adata.uns["trees"] = {}
        for i in range(n_distributions):
            adata.uns["trees"][i] = _get_random_trees(
                n_leaves=cells_per_distribution,
                n_trees=1,
                n_initial_nodes=kwargs.pop("n_initial_nodes", 5),
                leaf_names=[adata[adata.obs[key] == i].obs_names],
                # TODO(michalk8): fix the seed
                seed=seed,
            )[0]
    if quad_term == "spatial":
        dim = kwargs.pop("spatial_dim", 2)
        adata.obsm["spatial"] = rng.normal(size=(adata.n_obs, dim))
    if quad_term == "barcode":
        n_intBCs = kwargs.pop("n_intBCs", 20)
        barcode_dim = kwargs.pop("barcode_dim", 10)
        adata.obsm["barcode"] = rng.choice(n_intBCs, size=(adata.n_obs, barcode_dim))
    if lin_cost_matrix is not None:
        adata.uns[lin_cost_matrix] = {}
        for i, j in combinations(range(n_distributions), 2):
            adata.uns[lin_cost_matrix][(str(i), str(j))] = np.abs(
                rng.normal(size=(cells_per_distribution, cells_per_distribution))
            )
    if quad_cost_matrix is not None:
        quad_costs = [
            (np.abs(rng.normal(size=(cells_per_distribution, cells_per_distribution)))) for i in range(n_distributions)
        ]
        quad_cm = block_diag(*quad_costs)
        np.fill_diagonal(quad_cm, 0)
        adata.obsp[quad_cost_matrix] = quad_cm
    return adata



def _load_dataset_from_url(
    fpath: PathLike,
    file_type: Literal["h5ad", "h5mu"],
    *,
    backup_url: str,
    expected_shape: Tuple[int, int],
    force_download: bool = False,
    **kwargs: Any,
) -> Union[ad.AnnData, mu.MuData]:
    # TODO: make nicer once https://github.com/scverse/mudata/issues/76 resolved
    fpath = os.path.expanduser(fpath)
    assert file_type in ["h5ad", "h5mu"], f"Invalid type `{file_type}`. Must be one of `['h5ad', 'h5mu']`."
    if not fpath.endswith(file_type):
        fpath += f".{file_type}"
    if force_download and os.path.exists(fpath):
        os.remove(fpath)
    if not _check_datafile_present_and_download(backup_url=backup_url, path=fpath):
        raise FileNotFoundError(f"File `{fpath}` not found or download failed.")
    data = ad.read_h5ad(filename=fpath, **kwargs) if file_type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False)

    if data.shape != expected_shape:
        data_str = "MuData" if file_type == "h5mu" else "AnnData"
        raise ValueError(f"Expected {data_str} object to have shape `{expected_shape}`, found `{data.shape}`.")

    return data


def _get_random_trees(
    n_leaves: int, n_trees: int, n_initial_nodes: int = 50, leaf_names: Optional[List[List[str]]] = None, seed: int = 42
) -> List[nx.DiGraph]:
    rng = np.random.RandomState(42)
    if leaf_names is not None:
        assert len(leaf_names) == n_trees
        for i in range(n_trees):
            assert len(leaf_names[i]) == n_leaves
    trees = []
    for tree_idx in range(n_trees):
        tempG = nx.random_labeled_tree(n_initial_nodes, seed=seed)
        G = nx.DiGraph()
        G.add_edges_from(tempG.edges)
        leaves = [x for x in G.nodes() if G.out_degree(x) == 0 and G.in_degree(x) == 1]
        inner_nodes = list(set(G.nodes()) - set(leaves))
        leaves_updated = leaves.copy()
        for i in range(n_leaves - len(leaves)):
            G.add_node(n_initial_nodes + i)
            G.add_edge(rng.choice(inner_nodes, 1)[0], n_initial_nodes + i)
            leaves_updated.append(n_initial_nodes + i)
        assert len(leaves_updated) == n_leaves
        if leaf_names is not None:
            relabel_dict = {leaves_updated[i]: leaf_names[tree_idx][i] for i in range(len(leaves_updated))}
            G = nx.relabel_nodes(G, relabel_dict)
        trees.append(G)

    return trees