Hello,
I am new to the scverse, and I am trying to annotate a query dataset with popv.
Dataset info
My query dataset is an AnnData object of publicly available 10X mouse pancreatic islet datasets that I downloaded from the SRA. The datasets were merged after minimal processing (ambient RNA removal with DecontX and doublet removal with DoubletFinder).
For the model, I am using popV/tabula_muris_Pancreas_10x from HuggingFace.
*To make the code cells more readable, I added the > symbol at the beginning of executed lines of code.
# Step 2: Load your data (User Action Required)
> query_adata
AnnData object with n_obs × n_vars = 464747 × 27440
obs: 'run', 'total_counts', 'n_genes_by_counts', 'sex', 'pct_counts_mt', 'cell_type', 'n_genes', 'doublet_score', 'predicted_doublet'
uns: 'log1p'
layers: 'counts'
# Preprocess the dataset
> query_adata.obs_names_make_unique()
> query_adata.var_names_make_unique()
> query_adata.X = query_adata.layers["counts"]
>
> query_adata.X.data
array([ 1., 1., 2., ..., 1., 18., 3.], dtype=float32)
Issue
I am getting stuck at step 4 of the example notebook, where the execution stops with AssertionError: Don’t call _normalize_index with non-categorical/string names.
I put in safeguards to make sure my indices are the required data type, but it looks like the error pops up all the same. I am not sure if there’s something that I am missing in the dataset prep, or if this is just a bug.
# Step 3 (User Action Required): Setting Up Annotation Parameters
> huggingface_repo = "popV/tabula_muris_Pancreas_10x"
> query_batch_key = "run"
> algorithms = None
>
> query_adata.var['feature_name'] = query_adata.var_names.astype(str) # I added this line as the query_data has gene symbols
# Step 4: Perform annotation
> import numba
>
> numba.__version__
'0.61.0'
> hmo = popv.hub.HubModel.pull_from_huggingface_hub(huggingface_repo, cache_dir="tmp/tabula_muris")
Fetching 22 files: 100%
22/22 [00:02<00:00, 1.95it/s]
OnClass.data-00000-of-00001: 100%
27.2M/27.2M [00:00<00:00, 73.9MB/s]
OnClass.index: 100%
222/222 [00:00<00:00, 20.3kB/s]
.gitattributes: 100%
1.64k/1.64k [00:00<00:00, 84.3kB/s]
OnClass.meta: 100%
67.6k/67.6k [00:00<00:00, 1.42MB/s]
accuracies.json: 100%
2.15k/2.15k [00:00<00:00, 98.4kB/s]
OnClass.npz: 100%
144M/144M [00:02<00:00, 65.3MB/s]
celltypist.pkl: 100%
325k/325k [00:00<00:00, 3.45MB/s]
README.md: 100%
6.51k/6.51k [00:00<00:00, 730kB/s]
harmony_knn_classifier.joblib: 100%
4.38M/4.38M [00:00<00:00, 17.4MB/s]
checkpoint: 100%
71.0/71.0 [00:00<00:00, 8.09kB/s]
minified_ref_adata.h5ad: 100%
10.9M/10.9M [00:00<00:00, 33.9MB/s]
obo_dag.joblib: 100%
321k/321k [00:00<00:00, 4.98MB/s]
metadata.json: 100%
858/858 [00:00<00:00, 61.5kB/s]
preprocessing.json: 100%
120k/120k [00:00<00:00, 4.67MB/s]
pynndescent_index.joblib: 100%
4.54M/4.54M [00:00<00:00, 43.8MB/s]
predictions.csv: 100%
173k/173k [00:00<00:00, 2.74MB/s]
svm_classifier.joblib: 100%
1.45M/1.45M [00:00<00:00, 32.9MB/s]
model.pt: 100%
10.9M/10.9M [00:00<00:00, 69.6MB/s]
model.pt: 100%
11.4M/11.4M [00:00<00:00, 66.4MB/s]
svm_classifier_cuml.joblib: 100%
155k/155k [00:00<00:00, 5.01MB/s]
scvi_knn_classifier.joblib: 100%
2.40M/2.40M [00:00<00:00, 14.1MB/s]
xgboost_classifier.model: 100%
2.16M/2.16M [00:00<00:00, 17.3MB/s]
Fetching 6 files: 100%
6/6 [00:01<00:00, 2.75it/s]
cl_popv.json: 100%
38.1M/38.1M [00:00<00:00, 115MB/s]
cl.ontology.nlp.emb: 100%
155M/155M [00:01<00:00, 156MB/s]
cl.json: 100%
32.4M/32.4M [00:00<00:00, 116MB/s]
README.md: 100%
33.0/33.0 [00:00<00:00, 1.14kB/s]
cl.ontology: 100%
214k/214k [00:00<00:00, 2.70MB/s]
.gitattributes: 100%
2.61k/2.61k [00:00<00:00, 106kB/s]
README.md: 100%
6.51k/6.51k [00:00<00:00, 785kB/s]
# The two lines below were added to try and mitigate the assertion error
> query_adata.obs.index = query_adata.obs.index.astype(str)
> query_adata.var.index = query_adata.var.index.astype(str)
> adata = hmo.annotate_data(query_adata, query_batch_key=query_batch_key, prediction_mode="inference", gene_symbols="feature_name")
INFO:cellxgene_census:The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.
SSSSSS
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-41-41e85ddb159a> in <cell line: 0>()
----> 1 adata = hmo.annotate_data(
2 query_adata,
3 query_batch_key=query_batch_key,
4 prediction_mode="inference", # "fast" does not integrate reference and query.
5 gene_symbols="feature_name", # "Uncomment if using gene symbols."
5 frames
/usr/local/lib/python3.11/dist-packages/popv/hub/_model.py in annotate_data(self, query_adata, query_batch_key, save_path, prediction_mode, methods, gene_symbols)
144 if gene_symbols is not None:
145 print("SSSSSS")
--> 146 query_adata = self.map_genes(adata=query_adata, gene_symbols=gene_symbols)
147 print("LLLLLL", self.local_dir, os.listdir(self.local_dir))
148
/usr/local/lib/python3.11/dist-packages/popv/hub/_model.py in map_genes(self, adata, gene_symbols)
386 adata.var["old_index"] = adata.var_names
387 adata.var_names = adata.var_names.map(feature_dict)
--> 388 adata = adata[:, adata.var.index.notna()].copy()
389 return adata
/usr/local/lib/python3.11/dist-packages/anndata/_core/anndata.py in __getitem__(self, index)
1009 def __getitem__(self, index: Index) -> AnnData:
1010 """Returns a sliced view of the object."""
-> 1011 oidx, vidx = self._normalize_indices(index)
1012 return AnnData(self, oidx=oidx, vidx=vidx, asview=True)
1013
/usr/local/lib/python3.11/dist-packages/anndata/_core/anndata.py in _normalize_indices(self, index)
990
991 def _normalize_indices(self, index: Index | None) -> tuple[slice, slice]:
--> 992 return _normalize_indices(index, self.obs_names, self.var_names)
993
994 # TODO: this is not quite complete...
/usr/local/lib/python3.11/dist-packages/anndata/_core/index.py in _normalize_indices(index, names0, names1)
31 ax0, ax1 = unpack_index(index)
32 ax0 = _normalize_index(ax0, names0)
---> 33 ax1 = _normalize_index(ax1, names1)
34 return ax0, ax1
35
/usr/local/lib/python3.11/dist-packages/anndata/_core/index.py in _normalize_index(indexer, index)
47 if not isinstance(index, pd.RangeIndex):
48 msg = "Don’t call _normalize_index with non-categorical/string names"
---> 49 assert index.dtype != float, msg
50 assert index.dtype != int, msg
51
AssertionError: Don’t call _normalize_index with non-categorical/string names