Source code for singlecell_cookbook.tools.preprocess._subset

from typing import Sequence

import numpy as np
import pandas as pd
from anndata import AnnData


[docs] def subset_obs( adata: AnnData, subset: pd.Index | Sequence[str | int | bool], ) -> None: """Subset observations (rows) in an AnnData object. This function modifies the AnnData object in-place by selecting a subset of observations based on the provided subset parameter. The subsetting can be done using observation names, integer indices, a boolean mask, or a pandas Index. Parameters ---------- adata : AnnData The annotated data matrix to subset. Will be modified in-place. subset : pd.Index | Sequence[str | int | bool] The subset specification. Can be one of: * A pandas Index containing observation names * A sequence of observation names (strings) * A sequence of integer indices * A boolean mask of length `adata.n_obs` Examples -------- >>> # Create an example AnnData object >>> import anndata >>> import pandas as pd >>> import numpy as np >>> adata = anndata.AnnData( ... X=np.array([[1, 2], [3, 4], [5, 6]]), ... obs=pd.DataFrame(index=['A', 'B', 'C']), ... var=pd.DataFrame(index=['gene1', 'gene2']) ... ) >>> # Subset using pandas Index >>> subset_obs(adata, pd.Index(['B', 'C'])) >>> adata.obs_names.tolist() ['B', 'C'] >>> # Subset using observation names >>> subset_obs(adata, ['A', 'B']) >>> adata.obs_names.tolist() ['A', 'B'] >>> # Subset using integer indices >>> subset_obs(adata, [0, 1]) >>> adata.obs_names.tolist() ['A', 'B'] >>> # Subset using boolean mask >>> subset_obs(adata, [True, False, True]) >>> adata.obs_names.tolist() ['A', 'C'] Notes ----- - The function modifies the AnnData object in-place - When using a boolean mask, its length must match the number of observations - When using integer indices, they must be valid indices for the observations - Invalid observation names or indices will raise KeyError or IndexError respectively - The order of observations in the output will match the order in the subset parameter """ if isinstance(subset, pd.Index): adata._inplace_subset_obs(subset) return subset = np.array(subset) # Handle boolean mask if subset.dtype.kind == "b": if len(subset) != adata.n_obs: raise IndexError( f"Boolean mask length ({len(subset)}) does not match number of " f"observations ({adata.n_obs})" ) subset = adata.obs_names[subset] # Handle integer indices elif subset.dtype.kind in "iu": if np.any(subset < 0) or np.any(subset >= adata.n_obs): raise IndexError(f"Integer indices must be between 0 and {adata.n_obs - 1}") subset = adata.obs_names[subset] adata._inplace_subset_obs(subset)
[docs] def subset_var( adata: AnnData, subset: pd.Index | Sequence[str | int | bool], ) -> None: """Subset variables (columns) in an AnnData object. This function modifies the AnnData object in-place by selecting a subset of variables based on the provided subset parameter. The subsetting can be done using variable names, integer indices, a boolean mask, or a pandas Index. Parameters ---------- adata : AnnData The annotated data matrix to subset. Will be modified in-place. subset : pd.Index | Sequence[str | int | bool] The subset specification. Can be one of: * A pandas Index containing variable names * A sequence of variable names (strings) * A sequence of integer indices * A boolean mask of length `adata.n_vars` Examples -------- >>> # Create an example AnnData object >>> import anndata >>> import pandas as pd >>> import numpy as np >>> adata = anndata.AnnData( ... X=np.array([[1, 2, 3], [4, 5, 6]]), ... obs=pd.DataFrame(index=['cell1', 'cell2']), ... var=pd.DataFrame(index=['gene1', 'gene2', 'gene3']) ... ) >>> # Subset using pandas Index >>> subset_var(adata, pd.Index(['gene2', 'gene3'])) >>> adata.var_names.tolist() ['gene2', 'gene3'] >>> # Subset using variable names >>> subset_var(adata, ['gene1', 'gene2']) >>> adata.var_names.tolist() ['gene1', 'gene2'] >>> # Subset using integer indices >>> subset_var(adata, [0, 1]) >>> adata.var_names.tolist() ['gene1', 'gene2'] >>> # Subset using boolean mask >>> subset_var(adata, [True, False, True]) >>> adata.var_names.tolist() ['gene1', 'gene3'] Notes ----- - The function modifies the AnnData object in-place - When using a boolean mask, its length must match the number of variables - When using integer indices, they must be valid indices for the variables - Invalid variable names or indices will raise KeyError or IndexError respectively - The order of variables in the output will match the order in the subset parameter """ if isinstance(subset, pd.Index): adata._inplace_subset_var(subset) return subset = np.array(subset) # Handle boolean mask if subset.dtype.kind == "b": if len(subset) != adata.n_vars: raise IndexError( f"Boolean mask length ({len(subset)}) does not match number of " f"variables ({adata.n_vars})" ) subset = adata.var_names[subset] # Handle integer indices elif subset.dtype.kind in "iu": if np.any(subset < 0) or np.any(subset >= adata.n_vars): raise IndexError( f"Integer indices must be between 0 and {adata.n_vars - 1}" ) subset = adata.var_names[subset] adata._inplace_subset_var(subset)