How to convert R matrix to anndata

Slack90 · January 15, 2025, 10:55pm

Hi,

I want to process the raw RNA-seq data deposited on GEO as “GSE165371_cb_adult_mouse.tar.gz” (GEO Accession viewer).

Here a screenshot of the folder content:

As you can see, it contains two text files and a .matrix file.
I want to use Python for further processing, so I need to convert the raw data from .matrix to an AnnData object. Does anyone know how to do it? I previously converted a Seurat object into AnnData, but I am not sure what to do when starting from a .matrix file.

gtca · January 15, 2025, 11:37pm

Hey @Slack90,

I think scanpy.read_10x_mtx() is what you’re looking for — it will load gene / cell IDs as well when all three files are in one folder.

Slack90 · January 16, 2025, 3:13pm

Thanks for the reply

Yes, that seems to be pretty much what I was looking for. However, when I tested it, I ran into some issues. I tried renaming the files so that they contain the suffix _matrix:

adata = sc.read_10x_mtx('/home/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

But I get the error:


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[6], line 1
----> 1 adata = sc.read_10x_mtx('/home/slacava3/data_jkebsch1/salvo/Xenopus_levis/code/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/legacy_api_wrap/__init__.py:82, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
     79 @wraps(fn)
     80 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
     81     if len(args_all) <= n_positional:
---> 82         return fn(*args_all, **kw)
     84     args_pos: P.args
     85     args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:561, in read_10x_mtx(path, var_names, make_unique, cache, cache_compression, gex_only, prefix)
    559 prefix = "" if prefix is None else prefix
    560 is_legacy = (path / f"{prefix}genes.tsv").is_file()
--> 561 adata = _read_10x_mtx(
    562     path,
    563     var_names=var_names,
    564     make_unique=make_unique,
    565     cache=cache,
    566     cache_compression=cache_compression,
    567     prefix=prefix,
    568     is_legacy=is_legacy,
    569 )
    570 if is_legacy or not gex_only:
    571     return adata

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:595, in _read_10x_mtx(path, var_names, make_unique, cache, cache_compression, prefix, is_legacy)
    589 suffix = "" if is_legacy else ".gz"
    590 adata = read(
    591     path / f"{prefix}matrix.mtx{suffix}",
    592     cache=cache,
    593     cache_compression=cache_compression,
    594 ).T  # transpose the data
--> 595 genes = pd.read_csv(
    596     path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
    597     header=None,
    598     sep="\t",
    599 )
    600 if var_names == "gene_symbols":
    601     var_names_idx = pd.Index(genes[1].values)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/common.py:765, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    761 if compression == "gzip":
    762     if isinstance(handle, str):
    763         # error: Incompatible types in assignment (expression has type
    764         # "GzipFile", variable has type "Union[str, BaseBuffer]")
--> 765         handle = gzip.GzipFile(  # type: ignore[assignment]
    766             filename=handle,
    767             mode=ioargs.mode,
    768             **compression_args,
    769         )
    770     else:
    771         handle = gzip.GzipFile(
    772             # No overload variant of "GzipFile" matches argument types
    773             # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
   (...)
    776             **compression_args,
    777         )

File ~/.conda/envs/MyEnv/lib/python3.11/gzip.py:174, in GzipFile.__init__(self, filename, mode, compresslevel, fileobj, mtime)
    172     mode += 'b'
    173 if fileobj is None:
--> 174     fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    175 if filename is None:
    176     filename = getattr(fileobj, 'name', '')

FileNotFoundError: [Errno 2] No such file or directory: '/home/cb_adult_mouse_features.tsv.gz'

Firstly, I had the issue that it could not find the “feature” file, so I changed “cb_adult_mouse_genes” to “cb_adult_mouse_features”. But now I have the issue that no .tsv file is found (where it should be .csv).

Slack90 · January 16, 2025, 4:09pm

I also tried converting the .txt files to .tsv.gz and then reran sc.read_10x_mtx, but I get the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:2606, in pandas._libs.hashtable.Int64HashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:2630, in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[13], line 1
----> 1 adata = sc.read_10x_mtx('/home/slacava3/data_jkebsch1/salvo/Xenopus_levis/code/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/legacy_api_wrap/__init__.py:82, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
     79 @wraps(fn)
     80 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
     81     if len(args_all) <= n_positional:
---> 82         return fn(*args_all, **kw)
     84     args_pos: P.args
     85     args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:561, in read_10x_mtx(path, var_names, make_unique, cache, cache_compression, gex_only, prefix)
    559 prefix = "" if prefix is None else prefix
    560 is_legacy = (path / f"{prefix}genes.tsv").is_file()
--> 561 adata = _read_10x_mtx(
    562     path,
    563     var_names=var_names,
    564     make_unique=make_unique,
    565     cache=cache,
    566     cache_compression=cache_compression,
    567     prefix=prefix,
    568     is_legacy=is_legacy,
    569 )
    570 if is_legacy or not gex_only:
    571     return adata

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:601, in _read_10x_mtx(path, var_names, make_unique, cache, cache_compression, prefix, is_legacy)
    595 genes = pd.read_csv(
    596     path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
    597     header=None,
    598     sep="\t",
    599 )
    600 if var_names == "gene_symbols":
--> 601     var_names_idx = pd.Index(genes[1].values)
    602     if make_unique:
    603         var_names_idx = anndata.utils.make_index_unique(var_names_idx)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 1

gtca · January 17, 2025, 8:27pm

Hey @Slack90,

> where it should be .csv

The default format for the cell barcode and feature information files is .tsv.gz as specified in the CellRanger documentation.

You can always read the .mtx file and metadata independently to construct an AnnData object with AnnData(X=X, obs=cell_barcodes, var=features):

from scipy.io import mmread
import pandas as pd

X = mmread("cb_adult_mouse.mtx")
cell_barcodes = pd.read_csv("cb_adult_mouse_barcodes.tsv.gz", compression="gzip", sep="\t")
features = pd.read_csv("cb_adult_mouse_genes.tsv.gz", compression="gzip", sep="\t")

Topic		Replies	Views
Convert Scanpy (h5ad) to Seurat (rds) anndata	9	15937	September 27, 2024
Reading matrix.mtx with "real" number format scanpy	0	358	October 29, 2023
How to load GEO datasets for analysis using Scanpy / Scvi tools? scRNA-seq	6	1992	November 6, 2024
Read_10x_mtx error UnicodeDecodeError: scanpy	1	630	March 16, 2023
Error reading HDF5 file in MuData/AnnData anndata	2	65	October 6, 2024

How to convert R matrix to anndata

Related topics