How to convert R matrix to anndata

Hi,

I want to process the raw RNA-seq data deposited as “GSE165371_cb_adult_mouse.tar.gz” (GEO Accession viewer).

Here a screenshot of the folder content:

As you can see, it contains two text files and a .matrix file.
I want to use Python for further processing, so I need to convert their raw data from .matrix to an AnnData object. Does anyone know how to do it? I previously converted a Seurat object into AnnData, but I am not sure what to do when starting from a .matrix

Hey @Slack90,

I think scanpy.read_10x_mtx() is what you’re looking for — it will load gene / cell IDs as well when all three files are in one folder.

Thanks for the reply

Yes, that seems to be pretty much what I was looking for. However, when I tested it I ran into some issues. I tried to rename the files so that they contain the suffix _matrix:

adata = sc.read_10x_mtx('/home/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

But I get the error:


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[6], line 1
----> 1 adata = sc.read_10x_mtx('/home/slacava3/data_jkebsch1/salvo/Xenopus_levis/code/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/legacy_api_wrap/__init__.py:82, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
     79 @wraps(fn)
     80 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
     81     if len(args_all) <= n_positional:
---> 82         return fn(*args_all, **kw)
     84     args_pos: P.args
     85     args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:561, in read_10x_mtx(path, var_names, make_unique, cache, cache_compression, gex_only, prefix)
    559 prefix = "" if prefix is None else prefix
    560 is_legacy = (path / f"{prefix}genes.tsv").is_file()
--> 561 adata = _read_10x_mtx(
    562     path,
    563     var_names=var_names,
    564     make_unique=make_unique,
    565     cache=cache,
    566     cache_compression=cache_compression,
    567     prefix=prefix,
    568     is_legacy=is_legacy,
    569 )
    570 if is_legacy or not gex_only:
    571     return adata

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:595, in _read_10x_mtx(path, var_names, make_unique, cache, cache_compression, prefix, is_legacy)
    589 suffix = "" if is_legacy else ".gz"
    590 adata = read(
    591     path / f"{prefix}matrix.mtx{suffix}",
    592     cache=cache,
    593     cache_compression=cache_compression,
    594 ).T  # transpose the data
--> 595 genes = pd.read_csv(
    596     path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
    597     header=None,
    598     sep="\t",
    599 )
    600 if var_names == "gene_symbols":
    601     var_names_idx = pd.Index(genes[1].values)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/io/common.py:765, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    761 if compression == "gzip":
    762     if isinstance(handle, str):
    763         # error: Incompatible types in assignment (expression has type
    764         # "GzipFile", variable has type "Union[str, BaseBuffer]")
--> 765         handle = gzip.GzipFile(  # type: ignore[assignment]
    766             filename=handle,
    767             mode=ioargs.mode,
    768             **compression_args,
    769         )
    770     else:
    771         handle = gzip.GzipFile(
    772             # No overload variant of "GzipFile" matches argument types
    773             # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
   (...)
    776             **compression_args,
    777         )

File ~/.conda/envs/MyEnv/lib/python3.11/gzip.py:174, in GzipFile.__init__(self, filename, mode, compresslevel, fileobj, mtime)
    172     mode += 'b'
    173 if fileobj is None:
--> 174     fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    175 if filename is None:
    176     filename = getattr(fileobj, 'name', '')

FileNotFoundError: [Errno 2] No such file or directory: '/home/cb_adult_mouse_features.tsv.gz'

Firstly, I had the issue that it could not find the “features” file, so I changed “cb_adult_mouse_genes” to “cb_adult_mouse_features”. But now I have the issue that no .tsv file is found (where it should be .csv)

I also tried to convert the .txt files to .tsv.gz and then reran sc.read_10x_mtx, but I get the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:2606, in pandas._libs.hashtable.Int64HashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:2630, in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[13], line 1
----> 1 adata = sc.read_10x_mtx('/home/slacava3/data_jkebsch1/salvo/Xenopus_levis/code/GSE165371_cb_adult_mouse/', prefix = 'cb_adult_mouse_', cache_compression= 'gzip')

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/legacy_api_wrap/__init__.py:82, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
     79 @wraps(fn)
     80 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
     81     if len(args_all) <= n_positional:
---> 82         return fn(*args_all, **kw)
     84     args_pos: P.args
     85     args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:561, in read_10x_mtx(path, var_names, make_unique, cache, cache_compression, gex_only, prefix)
    559 prefix = "" if prefix is None else prefix
    560 is_legacy = (path / f"{prefix}genes.tsv").is_file()
--> 561 adata = _read_10x_mtx(
    562     path,
    563     var_names=var_names,
    564     make_unique=make_unique,
    565     cache=cache,
    566     cache_compression=cache_compression,
    567     prefix=prefix,
    568     is_legacy=is_legacy,
    569 )
    570 if is_legacy or not gex_only:
    571     return adata

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/scanpy/readwrite.py:601, in _read_10x_mtx(path, var_names, make_unique, cache, cache_compression, prefix, is_legacy)
    595 genes = pd.read_csv(
    596     path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
    597     header=None,
    598     sep="\t",
    599 )
    600 if var_names == "gene_symbols":
--> 601     var_names_idx = pd.Index(genes[1].values)
    602     if make_unique:
    603         var_names_idx = anndata.utils.make_index_unique(var_names_idx)

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~/.conda/envs/MyEnv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 1

Hey @Slack90,

where it should be .csv

The default format for the cell barcode and feature information files is .tsv.gz as specified in the CellRanger documentation.

You can always read the .mtx file and metadata independently to construct an AnnData object with AnnData(X=X, obs=cell_barcodes, var=features):

from scipy.io import mmread
import pandas as pd

# Load the count matrix from its Matrix Market file (sparse coordinate files
# come back as a SciPy sparse matrix in COO format).
# NOTE(review): 10x-style .mtx files are usually stored as genes x cells, while
# AnnData expects cells x genes. If X.shape is (n_genes, n_cells), transpose
# first with X = X.T.tocsr() -- confirm the orientation for this dataset.
X = mmread("cb_adult_mouse.mtx")

# header=None: 10x-style barcode/gene files have no header row (scanpy reads
# them the same way). With pandas' default header=0 the first barcode/gene
# would be consumed as the column name, leaving each table one row short of
# the matrix dimensions.
cell_barcodes = pd.read_csv("cb_adult_mouse_barcodes.tsv.gz", compression="gzip", sep="\t", header=None)
features = pd.read_csv("cb_adult_mouse_genes.tsv.gz", compression="gzip", sep="\t", header=None)