Hello,
This is a minimal example of my analysis pipeline:
mdata = mu.read_10x_h5(os.path.join(data_dir, "filtered_feature_bc_matrix.h5"))
mdata.var_names_make_unique()
rna = mdata.mod['rna']
atac = mdata.mod['atac']
# RNA
rna.var['mt'] = rna.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(rna, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
mu.pp.filter_var(rna, 'n_cells_by_counts', lambda x: x >= 10)
mu.pp.filter_obs(rna, 'n_genes_by_counts', lambda x: (x >= 500) & (x < 5000))
mu.pp.filter_obs(rna, 'total_counts', lambda x: x < 15000)
mu.pp.filter_obs(rna, 'pct_counts_mt', lambda x: x < 20)
sc.pp.scrublet(rna)
doublets = sum(rna.obs.predicted_doublet)
mu.pp.filter_obs(rna, 'predicted_doublet', lambda x: x==False)
rna.layers['raw_counts'] = rna.X
sc.pp.normalize_total(rna, target_sum=1e4) #1e6 to do CPM normalization
sc.pp.log1p(rna)
rna.layers['lognorm_counts'] = rna.X
sc.pp.highly_variable_genes(rna, min_mean=0.02, max_mean=4, min_disp=0.5)
sc.pp.scale(rna, max_value=10)
rna.layers['scaled_counts'] = rna.X
rna.X = rna.layers['raw_counts'] # Restore original raw counts before Pearson residual normalization
analytic_pearson = sc.experimental.pp.normalize_pearson_residuals(rna, inplace=False)
rna.layers['analytic_pearson_residuals'] = csr_matrix(analytic_pearson['X'])
# ATAC
sc.pp.scrublet(atac)
doublets = sum(atac.obs.predicted_doublet)
mu.pp.filter_obs(atac, 'predicted_doublet', lambda x: x==False)
sc.pp.calculate_qc_metrics(atac, percent_top=None, log1p=False, inplace=True)
# Rename columns to make them fit to scATAC-seq (scanpy labels the metrics according to RNA-seq)
atac.obs.rename(
columns={
'n_genes_by_counts': 'n_features_per_cell',
'total_counts': 'total_fragment_counts'
},
inplace=True
)
atac.obs['log_total_fragment_counts'] = np.log10(atac.obs['total_fragment_counts'])
ac.tl.nucleosome_signal(atac)
nuc_signal_threshold = 2
atac.obs['nuc_signal_filter'] = [
'NS_FAIL' if ns > nuc_signal_threshold else 'NS_PASS'
for ns in atac.obs['nucleosome_signal']
]
ac.tl.get_gene_annotation_from_rna(mdata['rna']).head(3)
tss = ac.tl.tss_enrichment(mdata, n_tss=2000)
tss_threshold = 1.5
tss.obs["tss_filter"] = [
"TSS_FAIL" if score < tss_threshold else "TSS_PASS"
for score in atac.obs["tss_score"]
]
mu.pp.filter_obs(
atac,
"total_fragment_counts",
lambda x: (x >= 5000) & (x <= 45000),
)
mu.pp.filter_obs(
atac,
"n_features_per_cell",
lambda x: x >= 2200
)
mu.pp.filter_obs(
atac,
"tss_score",
lambda x: (x >= 2.0) & (x <= 20),
)
mu.pp.filter_obs(
atac,
"nucleosome_signal",
lambda x: x <= 2
)
mu.pp.filter_var(atac, "n_cells_by_counts", lambda x: x >= 10)
atac.layers["raw_counts"] = atac.X
ac.pp.tfidf(atac, scale_factor=1e4)
atac.layers['tf-idf_norm_counts'] = atac.X
atac.X = atac.layers["raw_counts"]
sc.pp.normalize_per_cell(atac, counts_per_cell_after=1e4)
sc.pp.log1p(atac)
atac.layers['lognorm_counts'] = atac.X
atac.X = atac.layers['tf-idf_norm_counts']
sc.pp.highly_variable_genes(atac, min_mean=0.05, max_mean=1.5, min_disp=0.5)
mu.pp.intersect_obs(mdata)
Thanks,
Leandro