Problem with airr data loading

Hi, I am trying to use scirpy to do scRNAseq data analysis together with TCR analysis. I have the following issue, could you help me please?
Thank you for your help.

import tarfile
import warnings
from glob import glob
import anndata
import muon as mu
import numpy as np
import pandas as pd
import scanpy as sc
import scirpy as ir
from cycler import cycler
from matplotlib import cm as mpl_cm
from matplotlib import pyplot as plt

sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

# Load the associated transcriptomics data
adata = sc.read('immune_withoutX.h5ad')
adata.uns['log1p']["base"] = None
adata.var_names_make_unique()
# Load the TCR data
adata_tcr = ir.io.read_10x_vdj(
    "filtered_contig_annotations.csv"
)
adata_tcr.shape
(272, 0)
adata.shape
(5954, 14191)
mdata = mu.MuData({"gex": adata, "airr": adata_tcr})
mdata.obs_names_make_unique()
sc.pp.log1p(mdata["gex"])
sc.pp.pca(mdata["gex"], svd_solver="arpack")
sc.pp.neighbors(mdata["gex"])
sc.tl.umap(mdata["gex"])
WARNING: adata.X seems to be already log-transformed.
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished (0:00:00)
computing UMAP
    finished (0:00:10)
ir.pp.index_chains(mdata)
ir.tl.chain_qc(mdata)
100% 1/1 [00:00<00:00, 30.74it/s]
Stored result in `mdata.obs["airr:receptor_type"]`. Stored result in `mdata.obs["airr:receptor_subtype"]`. Stored result in `mdata.obs["airr:chain_pairing"]`.

fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 4), gridspec_kw={"wspace": 0.5})
mu.pl.embedding(mdata, basis="gex:umap", color=["Cd3e"], ax=ax0, show=False)
mu.pl.embedding(mdata, basis="gex:umap", color=["airr:receptor_type"], ax=ax1)
---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[42], line 3
      1 fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 4), gridspec_kw={"wspace": 0.5})
      2 mu.pl.embedding(mdata, basis="gex:umap", color=["Cd3e"], ax=ax0, show=False)
----> 3 mu.pl.embedding(mdata, basis="gex:umap", color=["airr:receptor_type"], ax=ax1)

File ~\Anaconda3\lib\site-packages\muon\_core\plot.py:264, in embedding(data, basis, color, use_raw, layer, **kwargs)
    261     color = [mod_key_modifier[k] for k in keys]
    263 ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp, uns=adata.uns)
--> 264 retval = sc.pl.embedding(ad, basis=basis_mod, color=color, **kwargs)
    265 for key, col in zip(keys, color):
    266     try:

File ~\Anaconda3\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:392, in embedding(adata, basis, color, gene_symbols, use_raw, sort_order, edges, edges_width, edges_color, neighbors_key, arrows, arrows_kwds, groups, components, dimensions, layer, projection, scale_factor, color_map, cmap, palette, na_color, na_in_legend, size, frameon, legend_fontsize, legend_fontweight, legend_loc, legend_fontoutline, colorbar_loc, vmax, vmin, vcenter, norm, add_outline, outline_width, outline_color, ncols, hspace, wspace, title, show, save, ax, return_fig, **kwargs)
    389         # if user did not set alpha, set alpha to 0.7
    390         kwargs['alpha'] = 0.7 if alpha is None else alpha
--> 392     cax = scatter(
    393         coords[:, 0],
    394         coords[:, 1],
    395         marker=".",
    396         c=color_vector,
    397         rasterized=settings._vector_friendly,
    398         norm=normalize,
    399         **kwargs,
    400     )
    402 # remove y and x ticks
    403 ax.set_yticks([])

File ~\Anaconda3\lib\site-packages\matplotlib\__init__.py:1442, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
   1439 @functools.wraps(func)
   1440 def inner(ax, *args, data=None, **kwargs):
   1441     if data is None:
-> 1442         return func(ax, *map(sanitize_sequence, args), **kwargs)
   1444     bound = new_sig.bind(ax, *args, **kwargs)
   1445     auto_label = (bound.arguments.get(label_namer)
   1446                   or bound.kwargs.get(label_namer))

File ~\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:4602, in Axes.scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, **kwargs)
   4599 if edgecolors is None:
   4600     orig_edgecolor = kwargs.get('edgecolor', None)
   4601 c, colors, edgecolors = \
-> 4602     self._parse_scatter_color_args(
   4603         c, edgecolors, kwargs, x.size,
   4604         get_next_color_func=self._get_patches_for_fill.get_next_color)
   4606 if plotnonfinite and colors is None:
   4607     c = np.ma.masked_invalid(c)

File ~\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:4400, in Axes._parse_scatter_color_args(c, edgecolors, kwargs, xsize, get_next_color_func)
   4393 if c is None:
   4394     c = (facecolors if facecolors is not None
   4395          else "b" if mpl.rcParams['_internal.classic_mode']
   4396          else get_next_color_func())
   4397 c_is_string_or_strings = (
   4398     isinstance(c, str)
   4399     or (np.iterable(c) and len(c) > 0
-> 4400         and isinstance(cbook._safe_first_finite(c), str)))
   4402 def invalid_shape_exception(csize, xsize):
   4403     return ValueError(
   4404         f"'c' argument has {csize} elements, which is inconsistent "
   4405         f"with 'x' and 'y' with size {xsize}.")

File ~\Anaconda3\lib\site-packages\matplotlib\cbook\__init__.py:1715, in _safe_first_finite(obj, skip_nonfinite)
   1712     raise RuntimeError("matplotlib does not "
   1713                        "support generators as input")
   1714 else:
-> 1715     return next(val for val in obj if safe_isfinite(val))

StopIteration: 

I have the same issue following this:

_ = ir.pl.group_abundance(
    mdata, groupby="airr:receptor_subtype", target_col="gex:mouse"
)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[52], line 1
----> 1 _ = ir.pl.group_abundance(
      2     mdata, groupby="airr:receptor_subtype", target_col="gex:mouse"
      3 )

File ~\Anaconda3\lib\site-packages\scirpy\pl\_group_abundance.py:98, in group_abundance(adata, groupby, target_col, normalize, max_cols, sort, **kwargs)
     95     default_style_kws.update(kwargs["style_kws"])
     96 kwargs["style_kws"] = default_style_kws
---> 98 return base.bar(abundance, **kwargs)

File ~\Anaconda3\lib\site-packages\scirpy\pl\base.py:65, in bar(data, ax, stacked, style, style_kws, fig_kws, **kwargs)
     63 if "grid" not in kwargs:
     64     kwargs["grid"] = False
---> 65 ax = data.plot.bar(ax=ax, stacked=stacked, **kwargs)
     67 # Remove excess x label
     68 if style_kws is not None:

File ~\Anaconda3\lib\site-packages\pandas\plotting\_core.py:1136, in PlotAccessor.bar(self, x, y, **kwargs)
   1046 @Appender(
   1047     """
   1048     See Also
   (...)
   1125     self, x=None, y=None, **kwargs
   1126 ) -> PlotAccessor:
   1127     """
   1128     Vertical bar plot.
   1129 
   (...)
   1134     other axis represents a measured value.
   1135     """
-> 1136     return self(kind="bar", x=x, y=y, **kwargs)

File ~\Anaconda3\lib\site-packages\pandas\plotting\_core.py:975, in PlotAccessor.__call__(self, *args, **kwargs)
    972             label_name = label_kw or data.columns
    973             data.columns = label_name
--> 975 return plot_backend.plot(data, kind=kind, **kwargs)

File ~\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\__init__.py:71, in plot(data, kind, **kwargs)
     69         kwargs["ax"] = getattr(ax, "left_ax", ax)
     70 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71 plot_obj.generate()
     72 plot_obj.draw()
     73 return plot_obj.result

File ~\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py:446, in MPLPlot.generate(self)
    444 def generate(self) -> None:
    445     self._args_adjust()
--> 446     self._compute_plot_data()
    447     self._setup_subplots()
    448     self._make_plot()

File ~\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py:632, in MPLPlot._compute_plot_data(self)
    630 # no non-numeric frames or series allowed
    631 if is_empty:
--> 632     raise TypeError("no numeric data to plot")
    634 self.data = numeric_data.apply(self._convert_to_ndarray)

TypeError: no numeric data to plot

Thank you very much for your help

Thanks for reporting this! Something could be wrong with the output of chain_qc. What’s the result of

mdata.obs["airr:receptor_type"].value_counts(dropna=False)

?

I suspect that no chain has been recognized as productive, maybe because scirpy didn’t recognize the locus names. Could you please also check:

ir.get.airr(mdata, ["locus", "productive"])

Thank you very much for your answer. I have the following:

mdata.obs[“airr:receptor_type”].value_counts(dropna=False)
airr:receptor_type
NaN 5954
TCR 210
Name: count, dtype: int64

ir.get.airr(mdata, [“locus”, “productive”])

VJ_1_locus	VJ_1_productive	VDJ_1_locus	VDJ_1_productive	VJ_2_locus	VJ_2_productive	VDJ_2_locus	VDJ_2_productive

cell_id
AAACCTGGTAGTACCT-1 TRA True TRB True None None None None
AAACGGGGTGTGAATA-1 TRA True TRB True None None None None
AAAGATGGTGTGCGTC-1 None None TRB True None None None None
AACACGTGTCGACTGC-1 TRA True TRB True None None None None
AACCGCGTCACCACCT-1 None None TRB True None None None None
… … … … … … … … …
TTGAACGAGGCATTGG-1 TRA True TRB True None None None None
TTGAACGTCAGCTTAG-1 TRA True TRB True None None None None
TTGGCAATCTTTACGT-1 TRA True TRB True None None None None
TTTACTGCAATCTGCA-1 TRA True TRB True TRA True None None
TTTATGCTCTCTAAGG-1 TRA True TRB True None None None None
210 rows × 8 columns

Thank you

Thanks for checking. This doesn’t look suspicious.

Are you sure the cell identifiers match between adata and adata_tcr?

You can check by looking at mdata["gex"].obs_names and mdata["airr"].obs_names and check the overlap for instance with

len(set(mdata["gex"].obs_names) & set(mdata["airr"].obs_names))

I have 0. Do you know how I could fix it?
i have modified the index of adata samples using the following during the single cell analysis:

adata316v2 = sc.read_10x_h5('316v2_sample_filtered_feature_bc_matrix.h5')
adata316v2.obs['mouse'] = 'msh2_ko_antipd1'
adata316v2.obs = adata316v2.obs.reset_index()
adata316v2.obs['index'] = adata316v2.obs['index']+'_'+adata316v2.obs['mouse'].astype(str)
adata316v2.obs = adata316v2.obs.set_index('index')
adata316v2.var_names_make_unique()
adata316v2
adata292v3 = sc.read_10x_h5('292v3_sample_filtered_feature_bc_matrix.h5')
adata292v3.obs['mouse'] = 'wt_antipd1'
adata292v3.obs = adata292v3.obs.reset_index()
adata292v3.obs['index'] = adata292v3.obs['index']+'_'+adata292v3.obs['mouse'].astype(str)
adata292v3.obs = adata292v3.obs.set_index('index')
adata292v3.var_names_make_unique()
adata292v3
adata = sc.concat([adata316v2 ,adata292v3])

Thus, I have different barcodes between adata and adata_tcr after processing the multiplexed samples.

cell_barcodes_adata = adata.obs_names

# Print the first few cell barcodes
print(cell_barcodes_adata[:5])
Index(['AAACCTGAGTATTGGA-1_msh2_ko_antipd1',
       'AAACCTGTCTGTACGA-1_msh2_ko_antipd1',
       'AAACGGGGTGTTTGGT-1_msh2_ko_antipd1',
       'AAAGTAGAGCCTTGAT-1_msh2_ko_antipd1',
       'AAATGCCGTGAGTATA-1_msh2_ko_antipd1'],
      dtype='object', name='index')

cell_barcodes_adata_tcr = adata_tcr.obs_names

# Print the first few cell barcodes
print(cell_barcodes_adata_tcr[:5])
Index(['airr:AAACCTGCAGACGTAG-1', 'airr:AAACCTGTCACCACCT-1',
       'airr:AAACGGGGTCCTCTTG-1', 'airr:AAAGTAGAGAATGTGT-1',
       'airr:AAATGCCAGGATGGTC-1'],
      dtype='object', name='cell_id'

The barcodes need to match. So you need to rename the barcodes of you TCR object the same way as your did in your gene expression object.

Thank you, I will try to find a way to do it. The platform sent me scRNAseq data for each sample but the TCR data were pooled for all samples.

Hi, I rename the barcodes, but now I have the following issue

C:\Users\mestrg01\Anaconda3\lib\site-packages\anndata\_core\anndata.py:1105: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if not is_categorical_dtype(df_full[k]):
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[23], line 2
      1 fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 4), gridspec_kw={"wspace": 0.5})
----> 2 mu.pl.embedding(mdata, basis="gex:umap", color=["Cd3e"], ax=ax0, show=False)
      3 mu.pl.embedding(mdata, basis="gex:umap", color=["airr:receptor_type"], ax=ax1)

File ~\Anaconda3\lib\site-packages\muon\_core\plot.py:263, in embedding(data, basis, color, use_raw, layer, **kwargs)
    256             obs = obs.join(
    257                 pd.DataFrame(x, columns=mod_keys, index=fmod_adata.obs_names),
    258                 how="left",
    259             )
    261     color = [mod_key_modifier[k] for k in keys]
--> 263 ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp, uns=adata.uns)
    264 retval = sc.pl.embedding(ad, basis=basis_mod, color=color, **kwargs)
    265 for key, col in zip(keys, color):

File ~\Anaconda3\lib\site-packages\anndata\_core\anndata.py:285, in AnnData.__init__(self, X, obs, var, uns, obsm, varm, layers, raw, dtype, shape, filename, filemode, asview, obsp, varp, oidx, vidx)
    283     self._init_as_view(X, oidx, vidx)
    284 else:
--> 285     self._init_as_actual(
    286         X=X,
    287         obs=obs,
    288         var=var,
    289         uns=uns,
    290         obsm=obsm,
    291         varm=varm,
    292         raw=raw,
    293         layers=layers,
    294         dtype=dtype,
    295         shape=shape,
    296         obsp=obsp,
    297         varp=varp,
    298         filename=filename,
    299         filemode=filemode,
    300     )

File ~\Anaconda3\lib\site-packages\anndata\_core\anndata.py:496, in AnnData._init_as_actual(self, X, obs, var, uns, obsm, varm, varp, obsp, raw, layers, dtype, shape, filename, filemode)
    493 self.uns = uns or OrderedDict()
    495 # TODO: Think about consequences of making obsm a group in hdf
--> 496 self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm))
    497 self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm))
    499 self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp))

File ~\Anaconda3\lib\site-packages\anndata\_core\aligned_mapping.py:265, in AxisArrays.__init__(self, parent, axis, vals)
    263 self._data = dict()
    264 if vals is not None:
--> 265     self.update(vals)

File ~\Anaconda3\lib\_collections_abc.py:941, in MutableMapping.update(self, other, **kwds)
    939 if isinstance(other, Mapping):
    940     for key in other:
--> 941         self[key] = other[key]
    942 elif hasattr(other, "keys"):
    943     for key in other.keys():

File ~\Anaconda3\lib\site-packages\anndata\_core\aligned_mapping.py:181, in AlignedActualMixin.__setitem__(self, key, value)
    180 def __setitem__(self, key: str, value: V):
--> 181     value = self._validate_value(value, key)
    182     self._data[key] = value

File ~\Anaconda3\lib\site-packages\anndata\_core\aligned_mapping.py:245, in AxisArraysBase._validate_value(self, val, key)
    236 if (
    237     hasattr(val, "index")
    238     and isinstance(val.index, cabc.Collection)
    239     and not (val.index == self.dim_names).all()
    240 ):
    241     # Could probably also re-order index if it’s contained
    242     raise ValueError(
    243         f"value.index does not match parent’s axis {self.axes[0]} names"
    244     )
--> 245 return super()._validate_value(val, key)

File ~\Anaconda3\lib\site-packages\anndata\_core\aligned_mapping.py:77, in AlignedMapping._validate_value(self, val, key)
     72             raise ValueError(
     73                 f"The AwkwardArray is of variable length in dimension {i}.",
     74                 f"Try ak.to_regular(array, {i}) before including the array in AnnData",
     75             )
     76         else:
---> 77             raise ValueError(
     78                 f"Value passed for key {key!r} is of incorrect shape. "
     79                 f"Values of {self.attrname} must match dimensions "
     80                 f"{self.axes} of parent. Value had shape {actual_shape} while "
     81                 f"it should have had {right_shape}."
     82             )
     84 if not self._allow_df and isinstance(val, pd.DataFrame):
     85     name = self.attrname.title().rstrip("s")

ValueError: Value passed for key 'X_pca' is of incorrect shape. Values of obsm must match dimensions (0,) of parent. Value had shape (5954,) while it should have had (6764,).

Would it be possible to help me to fix it please? Thank you for your help.

Best

I don’t see anything obvious that could be wrong. It might be an issue with MuData.

Could you please provide

  • which version of mudata you are using?
  • what does your MuData object look like (i.e. the output of just mdata in a jupyter cell).

Thank you, here is the mdata output

MuData object with n_obs × n_vars = 5954 × 21259
  2 modalities
    gex:	5954 x 21259
      obs:	'mouse', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
      var:	'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
      uns:	'dendrogram_leiden', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'mouse_colors', 'neighbors', 'pca', 'rank_genes_groups', 'umap'
      obsm:	'X_pca', 'X_umap'
      varm:	'PCs'
      obsp:	'connectivities', 'distances'
    airr:	182 x 0
      obs:	'receptor_type', 'receptor_subtype', 'chain_pairing'
      uns:	'scirpy_version', 'chain_indices'
      obsm:	'airr', 'chain_indices'

Muon (MuData) Version: 0.1.5

Something is odd here… I can’t reproduce your issue with the scirpy example dataset:

>>> import scirpy as ir
>>> import muon as mu
>>> import scanpy as sc
>>> import matplotlib.pyplot as plt

>>> mdata = ir.datasets.wu2020()
>>> mdata
MuData object with n_obs × n_vars = 161015 × 30727
  2 modalities
    gex:	141623 x 30727
      obs:	'cluster_orig', 'patient', 'sample', 'source'
      uns:	'cluster_orig_colors'
      obsm:	'X_umap_orig'
    airr:	123134 x 0
      obs:	'high_confidence', 'is_cell', 'clonotype_orig'
      obsm:	'airr'
>>> len(set(mdata["airr"].obs_names) & set(mdata["gex"].obs_names))
>>> fig, ax0 = plt.subplots()
    mu.pl.embedding(mdata, basis="gex:umap_orig", color=["CD3E"], ax=ax0, show=False)
    plt.show()

Here, as in your example, there are cells that are only in gex, some that are only in airr and some that overlap.

In your error message, it says

Value had shape (5954,) while it should have had (6764,).

Do you have any idea where the 6764 comes from? With 5954 rows, the X_pca seems to match your mdata object after all.


btw, you can format code blocks in your messages
with

```python
code
```

Sorry, I do not know why I had this number. So I restarted the analysis from scratch in the same jupyterlab for the mdata[‘gex’] and it is working now.