Merged AnnData objects with different dimensions, but getting stuck at PCA

I have 4 files that differ slightly in dimensions: the var is the same, but the obs is different. When I try to run PCA, it keeps failing with the error message "ValueError: Input contains NaN, infinity or a value too large for dtype('float32')."

This is what I wrote to run the scanpy Pearson residuals workflow:

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import anndata as ad

# Global scanpy configuration.
# Verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Log the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_header()
# Figure defaults; the quotes must be plain ASCII quotes, typographic
# quotes are a SyntaxError in Python.
sc.settings.set_figure_params(dpi=80, facecolor="white")

# Load the Cell Ranger filtered count matrices, one AnnData per sample.
adata_pbmc_normal_L001 = sc.read_10x_mtx("~/pbmc_normal_L001/outs/filtered_feature_bc_matrix/")
adata_pbmc_normal_L002 = sc.read_10x_mtx("~/pbmc_normal_L002/outs/filtered_feature_bc_matrix/")
adata_pbmc_normal_L003 = sc.read_10x_mtx("~/pbmc_normal_L003/outs/filtered_feature_bc_matrix/")
adata_pbmc_normal_L004 = sc.read_10x_mtx("~/pbmc_normal_L004/outs/filtered_feature_bc_matrix/")

adata_pbmc_normal_L001.uns["name"] = "pbmc_L001"
adata_pbmc_normal_L002.uns["name"] = "pbmc_L002"
adata_pbmc_normal_L003.uns["name"] = "pbmc_L003"
adata_pbmc_normal_L004.uns["name"] = "pbmc_L004"

# Work on copies so the freshly loaded objects stay untouched.
adata1 = adata_pbmc_normal_L001.copy()
adata2 = adata_pbmc_normal_L002.copy()
adata3 = adata_pbmc_normal_L003.copy()
adata4 = adata_pbmc_normal_L004.copy()

# Concatenate along obs (cells).
# - `.uns` is not carried over by `ad.concat` by default, so the sample names
#   are passed as mapping keys and stored in `adata.obs["sample"]`.
# - `index_unique="-"` appends the sample key to every barcode; the same
#   barcode can occur in several samples and obs names must be unique.
# - `fill_value=0`: a gene missing from one sample has zero counts there.
#   Filling with NaN puts NaN into the count matrix, which later breaks
#   normalization and PCA ("Input contains NaN, infinity ...").
adata = ad.concat(
    {
        "pbmc_L001": adata1,
        "pbmc_L002": adata2,
        "pbmc_L003": adata3,
        "pbmc_L004": adata4,
    },
    join="outer",
    label="sample",
    index_unique="-",
    fill_value=0,
)

# Remove genes that are not detected in any cell (this filters genes, not cells).
adata.var_names_make_unique()
sc.pp.filter_genes(adata, min_cells=1)
print()

# Remove doublets
# NOTE(review): this call only scores cells and annotates `adata.obs`; it does
# not drop any cells. If predicted doublets should really be removed, filter
# on that annotation afterwards. TODO: confirm the intended behaviour.
sc.external.pp.scrublet(adata)

# Marker genes from the table in the pbmc3k tutorial.
markers = [
    "IL7R",
    "LYZ",
    "CD14",
    "MS4A1",
    "CD8A",
    "GNLY",
    "NKG7",
    "FCGR3A",
    "MS4A7",
    "CST3",
    "PPBP",
]

# Basic quality control.
# Flag mitochondrial genes (symbols starting with "MT-") so their share of
# counts is reported per cell as `pct_counts_mt`.
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)

# Per-cell distributions of detected genes, total counts and mitochondrial share.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)

# Pearson residuals for highly variable gene selection.
# This flavor expects raw counts in `adata.X`; it writes the boolean column
# `highly_variable` and `residual_variances` to `adata.var`.
sc.experimental.pp.highly_variable_genes(
    adata, flavor="pearson_residuals", n_top_genes=2000
)

# Plot residual variance against mean expression for the merged dataset.
#
# There is a single dataset, so one axes is enough. Do NOT loop with
# `for ax, adata in zip(axes, adata)`: iterating an AnnData yields one-cell
# slices and rebinds the global `adata` to a single cell, so every later step
# (normalization, PCA, ...) would run on that one cell and produce NaN.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

hvgs = adata.var["highly_variable"]
is_marker = np.isin(adata.var_names, markers)

# All genes.
ax.scatter(
    adata.var["mean_counts"], adata.var["residual_variances"], s=3, edgecolor="none"
)

# Genes selected as highly variable.
ax.scatter(
    adata.var["mean_counts"][hvgs],
    adata.var["residual_variances"][hvgs],
    c="tab:red",
    label="selected genes",
    s=3,
    edgecolor="none",
)

# Known marker genes.
ax.scatter(
    adata.var["mean_counts"][is_marker],
    adata.var["residual_variances"][is_marker],
    c="k",
    label="known marker genes",
    s=10,
    edgecolor="none",
)

ax.set_xscale("log")
ax.set_xlabel("mean expression")
ax.set_yscale("log")
ax.set_ylabel("residual variance")
ax.set_title("No Filter PBMC: ")

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.yaxis.set_ticks_position("left")
ax.xaxis.set_ticks_position("bottom")

ax.legend()

# Apply gene selection: keep only the genes flagged as highly variable.
# NOTE(review): the steps further down keep operating on `adata`, not on
# `pbmc_normal`. TODO: confirm which object the downstream analysis should use.
pbmc_normal = adata[:, adata.var["highly_variable"]]

# Notebook-style display of the subset (has no effect when run as a script).
pbmc_normal

# Preparations
# Keep raw and depth-normalized counts for later.
adata.layers["raw"] = adata.X.copy()
adata.layers["sqrt_norm"] = np.sqrt(
    sc.pp.normalize_total(adata, inplace=False)["X"]
)

# Replace X with Pearson residuals. X must hold raw counts at this point and
# must not contain NaN, otherwise the PCA below fails with
# "Input contains NaN, infinity or a value too large for dtype('float32')".
sc.experimental.pp.normalize_pearson_residuals(adata)

# Dimensionality reduction and embedding.
sc.pp.pca(adata, n_comps=50)
n_cells = len(adata)
sc.tl.tsne(adata, use_rep="X_pca")

# Neighborhood graph and Leiden clustering on the PCA representation.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)
sc.tl.leiden(adata)

# t-SNE colored by cluster, then by marker expression taken from the
# square-root depth-normalized layer.
sc.pl.tsne(adata, color=["leiden"], cmap="tab20")
sc.pl.tsne(adata, color=markers, layer="sqrt_norm")