Hello,
Thanks a lot for this great tool.
I’m running the following analysis:
control = sc.read_h5ad(control_dir) #When you want to load after SOLO you need to use h5ad load instead of h5
control.var_names_make_unique()
ove = sc.read_h5ad(ove_dir)
ove.var_names_make_unique()
healthy = sc.read_h5ad(healthy_dir)
healthy.var_names_make_unique()
adata = control.concatenate(ove, healthy, batch_key="Sample", batch_categories=["control", "ove", "healthy"])
print(adata.obs['Sample'])
# mitochondrial genes
adata.var["mt"] = adata.var_names.str.startswith("mt-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("rps", "rps"))
# hemoglobin genes.
adata.var["hb"] = adata.var_names.str.contains(("^hb[^(p)]"))
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
#normalize data that can be used for other scanpy functions, etc.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(adata, n_top_genes=30454, subset = True, layer = 'counts', flavor = "seurat_v3", batch_key="Sample")
#Here I use all genes, so I can further perform DEGs on all genes
scvi.model.SCVI.setup_anndata(adata, layer='counts',
categorical_covariate_keys=["Sample"],
continuous_covariate_keys=['pct_counts_mt', 'pct_counts_ribo'])
model = scvi.model.SCVI(adata)
model.train(check_val_every_n_epoch=5, max_epochs=300, early_stopping=True, early_stopping_patience=20, early_stopping_monitor='elbo_validation')
latent = model.get_latent_representation() #this is what you will use to cluster now instead of PCs like normal
latent.shape
adata.obsm['X_scVI'] = latent
adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size = 1e4)
sc.pp.neighbors(adata, use_rep = 'X_scVI')
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution = 0.4)
genes = model.differential_expression(idx1 = [(adata.obs.Sample == 'control') & (adata.obs.leiden == '0')],
idx2 = [(adata.obs.Sample == 'healthy') & (adata.obs.leiden == '0')], **batch_correction=True**)
I’m not completely sure if I should be using batch_correction=True when performing differential_expression() or not? I would be really happy to hear your opinion on this?
Thanks a lot!