from typing import Sequence
import numpy as np
import pandas as pd
from anndata import AnnData
# [docs]  (Sphinx source-link marker left over from HTML extraction; not code)
def subset_obs(
    adata: AnnData,
    subset: pd.Index | Sequence[str | int | bool],
) -> None:
    """Subset observations (rows) in an AnnData object.

    This function modifies the AnnData object in-place by selecting a subset
    of observations based on the provided subset parameter. The subsetting can
    be done using observation names, integer indices, a boolean mask, or a
    pandas Index.

    Parameters
    ----------
    adata : AnnData
        The annotated data matrix to subset. Will be modified in-place.
    subset : pd.Index | Sequence[str | int | bool]
        The subset specification. Can be one of:

        * A pandas Index containing observation names (forwarded to AnnData
          as-is, without any validation in this function)
        * A sequence of observation names (strings)
        * A sequence of non-negative integer indices
        * A boolean mask of length `adata.n_obs`
        * An empty sequence, which selects no observations

    Raises
    ------
    IndexError
        If a boolean mask does not have length `adata.n_obs`, or if any
        integer index is negative or `>= adata.n_obs`.
    KeyError
        Expected from AnnData's own indexing when an observation name is not
        present in `adata.obs_names`.

    Examples
    --------
    Because the object is modified in-place, every example below starts from a
    freshly created AnnData object.

    >>> import anndata
    >>> import pandas as pd
    >>> import numpy as np
    >>> def make_adata():
    ...     return anndata.AnnData(
    ...         X=np.array([[1, 2], [3, 4], [5, 6]]),
    ...         obs=pd.DataFrame(index=['A', 'B', 'C']),
    ...         var=pd.DataFrame(index=['gene1', 'gene2']),
    ...     )

    >>> # Subset using pandas Index
    >>> adata = make_adata()
    >>> subset_obs(adata, pd.Index(['B', 'C']))
    >>> adata.obs_names.tolist()
    ['B', 'C']

    >>> # Subset using observation names
    >>> adata = make_adata()
    >>> subset_obs(adata, ['A', 'B'])
    >>> adata.obs_names.tolist()
    ['A', 'B']

    >>> # Subset using integer indices
    >>> adata = make_adata()
    >>> subset_obs(adata, [0, 1])
    >>> adata.obs_names.tolist()
    ['A', 'B']

    >>> # Subset using boolean mask
    >>> adata = make_adata()
    >>> subset_obs(adata, [True, False, True])
    >>> adata.obs_names.tolist()
    ['A', 'C']

    Notes
    -----
    - The function modifies the AnnData object in-place and returns ``None``
    - When using a boolean mask, its length must match the number of
      observations
    - Integer indices must lie in ``[0, adata.n_obs)``; negative indices are
      rejected rather than wrapped around
    - For names and integer indices the resulting order follows the order
      given in `subset`; a boolean mask keeps the original observation order
    """
    if isinstance(subset, pd.Index):
        # A pandas Index is handed to AnnData untouched.
        adata._inplace_subset_obs(subset)
        return

    subset = np.asarray(subset)

    # NumPy infers float64 for an empty sequence, which would match neither
    # the boolean nor the integer branch below and would be forwarded as a
    # float indexer. Treat it as an empty integer selection instead.
    if subset.size == 0 and subset.dtype.kind == "f":
        subset = subset.astype(np.intp)

    if subset.dtype.kind == "b":
        # Boolean mask: must cover every observation exactly once.
        if len(subset) != adata.n_obs:
            raise IndexError(
                f"Boolean mask length ({len(subset)}) does not match number of "
                f"observations ({adata.n_obs})"
            )
        subset = adata.obs_names[subset]
    elif subset.dtype.kind in "iu":
        # Integer positions: validated here, then translated to names.
        if np.any(subset < 0) or np.any(subset >= adata.n_obs):
            raise IndexError(f"Integer indices must be between 0 and {adata.n_obs - 1}")
        subset = adata.obs_names[subset]

    # Anything else (e.g. strings) is passed through as observation names.
    adata._inplace_subset_obs(subset)
# [docs]  (Sphinx source-link marker left over from HTML extraction; not code)
def subset_var(
    adata: AnnData,
    subset: pd.Index | Sequence[str | int | bool],
) -> None:
    """Subset variables (columns) in an AnnData object.

    This function modifies the AnnData object in-place by selecting a subset
    of variables based on the provided subset parameter. The subsetting can be
    done using variable names, integer indices, a boolean mask, or a pandas
    Index.

    Parameters
    ----------
    adata : AnnData
        The annotated data matrix to subset. Will be modified in-place.
    subset : pd.Index | Sequence[str | int | bool]
        The subset specification. Can be one of:

        * A pandas Index containing variable names (forwarded to AnnData
          as-is, without any validation in this function)
        * A sequence of variable names (strings)
        * A sequence of non-negative integer indices
        * A boolean mask of length `adata.n_vars`
        * An empty sequence, which selects no variables

    Raises
    ------
    IndexError
        If a boolean mask does not have length `adata.n_vars`, or if any
        integer index is negative or `>= adata.n_vars`.
    KeyError
        Expected from AnnData's own indexing when a variable name is not
        present in `adata.var_names`.

    Examples
    --------
    Because the object is modified in-place, every example below starts from a
    freshly created AnnData object.

    >>> import anndata
    >>> import pandas as pd
    >>> import numpy as np
    >>> def make_adata():
    ...     return anndata.AnnData(
    ...         X=np.array([[1, 2, 3], [4, 5, 6]]),
    ...         obs=pd.DataFrame(index=['cell1', 'cell2']),
    ...         var=pd.DataFrame(index=['gene1', 'gene2', 'gene3']),
    ...     )

    >>> # Subset using pandas Index
    >>> adata = make_adata()
    >>> subset_var(adata, pd.Index(['gene2', 'gene3']))
    >>> adata.var_names.tolist()
    ['gene2', 'gene3']

    >>> # Subset using variable names
    >>> adata = make_adata()
    >>> subset_var(adata, ['gene1', 'gene2'])
    >>> adata.var_names.tolist()
    ['gene1', 'gene2']

    >>> # Subset using integer indices
    >>> adata = make_adata()
    >>> subset_var(adata, [0, 1])
    >>> adata.var_names.tolist()
    ['gene1', 'gene2']

    >>> # Subset using boolean mask
    >>> adata = make_adata()
    >>> subset_var(adata, [True, False, True])
    >>> adata.var_names.tolist()
    ['gene1', 'gene3']

    Notes
    -----
    - The function modifies the AnnData object in-place and returns ``None``
    - When using a boolean mask, its length must match the number of variables
    - Integer indices must lie in ``[0, adata.n_vars)``; negative indices are
      rejected rather than wrapped around
    - For names and integer indices the resulting order follows the order
      given in `subset`; a boolean mask keeps the original variable order
    """
    if isinstance(subset, pd.Index):
        # A pandas Index is handed to AnnData untouched.
        adata._inplace_subset_var(subset)
        return

    subset = np.asarray(subset)

    # NumPy infers float64 for an empty sequence, which would match neither
    # the boolean nor the integer branch below and would be forwarded as a
    # float indexer. Treat it as an empty integer selection instead.
    if subset.size == 0 and subset.dtype.kind == "f":
        subset = subset.astype(np.intp)

    if subset.dtype.kind == "b":
        # Boolean mask: must cover every variable exactly once.
        if len(subset) != adata.n_vars:
            raise IndexError(
                f"Boolean mask length ({len(subset)}) does not match number of "
                f"variables ({adata.n_vars})"
            )
        subset = adata.var_names[subset]
    elif subset.dtype.kind in "iu":
        # Integer positions: validated here, then translated to names.
        if np.any(subset < 0) or np.any(subset >= adata.n_vars):
            raise IndexError(
                f"Integer indices must be between 0 and {adata.n_vars - 1}"
            )
        subset = adata.var_names[subset]

    # Anything else (e.g. strings) is passed through as variable names.
    adata._inplace_subset_var(subset)