Is this currently possible?
This should be possible, although the implementation will be a bit hacky (the last release cleaned it up a lot, though!).
To concatenate backed `AnnData` objects, you could read in all the annotations (which I’m assuming is fine to read into memory), write that out, then iteratively write the combined `X` and `layers` arrays.
In the very simple case (datasets are already aligned along `var`, no `layers`, no `raw`, all `X` are CSR matrices), this hacky solution could look like:
# Setup
from pathlib import Path
import h5py
from scipy import sparse
import anndata as ad
from anndata._core.sparse_dataset import SparseDataset
from anndata.experimental import read_elem, write_elem
def read_everything_but_X(pth) -> ad.AnnData:
    """Load only the annotations of an h5ad file, leaving ``X`` on disk.

    Parameters
    ----------
    pth
        Path to the h5ad file to read.

    Returns
    -------
    An ``AnnData`` built from the ``obs``, ``var``, ``obsm``, ``varm``,
    ``obsp``, ``varp`` and ``uns`` elements that are present in the file.
    """
    attrs = ["obs", "var", "obsm", "varm", "obsp", "varp", "uns"]
    # Open explicitly read-only so the source file can never be created or
    # modified by accident, regardless of the h5py default mode.
    with h5py.File(pth, "r") as f:
        # Skip elements the file does not contain instead of failing with a
        # KeyError; AnnData falls back to its defaults for omitted arguments.
        adata = ad.AnnData(**{k: read_elem(f[k]) for k in attrs if k in f})
    return adata
def concat_on_disk(input_pths: list[Path], output_pth: Path):
    """Concatenate h5ad files along observations without loading ``X`` into memory.

    The annotations of every input are read into memory, concatenated with
    ``ad.concat`` and written to ``output_pth``. ``X`` is then built on disk by
    appending each input's ``X`` to an initially empty CSR matrix.

    Params
    ------
    input_pths
        Paths to h5ad files which will be concatenated
    output_pth
        File to write as a result
    """
    # The inputs are traversed twice (annotations, then X), so materialise them
    # in case a one-shot iterable such as a generator was passed.
    input_pths = list(input_pths)

    annotations = ad.concat([read_everything_but_X(pth) for pth in input_pths])
    annotations.write_h5ad(output_pth)
    n_variables = annotations.shape[1]
    # Only the variable count is needed from here on; drop the annotations
    # before streaming X.
    del annotations

    with h5py.File(output_pth, "a") as target:
        # Seed X with an empty float32 CSR matrix that the inputs are appended to.
        dummy_X = sparse.csr_matrix((0, n_variables), dtype="float32")
        dummy_X.indptr = dummy_X.indptr.astype("int64")  # Guarding against overflow for very large datasets
        write_elem(target, "X", dummy_X)

        mtx = SparseDataset(target["X"])
        for p in input_pths:
            with h5py.File(p, "r") as src:
                mtx.append(SparseDataset(src["X"]))
Create test data
from anndata.tests.helpers import gen_adata
def make_adata(*, size=(100, 50), batch_offset: int = 0):
    """Generate a test AnnData whose observation names are ``cell_<index>``.

    Parameters
    ----------
    size
        Passed straight to ``gen_adata``; its first entry is the number of
        observation names to assign.
    batch_offset
        Index of the first cell, so that separately generated batches can be
        given non-overlapping names.
    """
    generated = gen_adata(size)
    first = batch_offset
    last = batch_offset + size[0]
    cell_names = []
    for idx in range(first, last):
        cell_names.append(f"cell_{idx}")
    generated.obs_names = cell_names
    return generated
# Write ten test files into ./data, giving each batch a distinct range of
# cell names by carrying the running observation count forward as the offset.
# Uses pathlib rather than a shell escape so this also runs as plain Python.
Path("data").mkdir(parents=True, exist_ok=True)
offset = 0
for i in range(10):
    adata = make_adata(batch_offset=offset)
    # Remove layers before writing, since concat_on_disk only handles X.
    del adata.layers
    adata.write_h5ad(f"data/adata_{i:02}.h5ad")
    offset += adata.shape[0]
Where usage looks like:
# Gather every h5ad file under ./data in sorted order, then concatenate them
# on disk into result.h5ad.
pths = sorted(Path("data").glob("*.h5ad"))
concat_on_disk(pths, "result.h5ad")
And it works:
from anndata.tests.helpers import assert_equal
# Load the on-disk result and build the same concatenation fully in memory,
# then check that the two objects are equal.
result = ad.read_h5ad("result.h5ad")
in_mem = ad.concat([ad.read_h5ad(p) for p in pths])
in_mem.strings_to_categoricals() # pd.concat turns categoricals to strings for some reason
assert_equal(result, in_mem)
So, the parts are there at the moment – just not exposed for general use. This, of course, would need some work for handling more cases, and could be more efficient.
We’re aiming for AnnData to get a more complete solution for this in the near future (like, this year). Maybe this will be by just having dask handle it.