I have a set of single-cell libraries from an drug treatment experiment - early timepoint, treatment/DMSO at 3 timepoints (21 libraries total).
I have preprocessed each library separately with Seurat using SCTransform v2 additionally regressing out cell cycle, MT genes and Ribosomal genes. I now have a list object of normalised and transformed libraries.
I want to run integration as per Integrating datasets with scVI. I have a few questions regarding methodology:
-
Is it OK to use SCTransform-ed counts or should I still feed in log-normalised counts from NormalizeData() instead?
-
How should I select the HVG for feeding into the model? Currently my approach is to take all unique genes from the set of HVG defined for all of my libraries in the list (21 sets of slightly differet HVG).
-
When I select HVG, I manually filter out genes that would confound the analysis i.e:
rm.genes <- c(grep("^MT-", rownames(x), value = TRUE),
grep("^M?RP[SL]", rownames(x), value = TRUE),
grep("[\\.]", rownames(x), value = TRUE),
grep("^LINC", rownames(x), value = TRUE),
c("MALAT1", "CRYAB"))
Is this appropriate?
Full preprocessing code below:
## import raw matrices
seurat.list <- lapply(samplesheet$Directory, \(x){
message("\033[92;1;m Doing: ", x, " \033[0m")
file.dir <- paste0("/mnt/rds/DMP/UCEC/GLIOTEAM/10x_single_cell_DATA/cellranger_results/CloneSifter/",
x, "/outs/filtered_feature_bc_matrix/")
bcm <- Read10X(file.dir, unique.features = TRUE, strip.suffix = TRUE)
x.seurat <- CreateSeuratObject(counts = bcm,
project = x,
min.cells = 5,
min.features = 0)
x.seurat <- AddMetaData(x.seurat, metadata = x, col.name = "dataset")
x.seurat <- AddMetaData(x.seurat, metadata = Cells(x.seurat), col.name = "cellnames")
x.seurat <- RenameCells(x.seurat,
new.names = paste(x, 1:length(Cells(x.seurat)), sep = "_"))
require(EnsDb.Hsapiens.v86)
ens.86.genes <- genes(EnsDb.Hsapiens.v86)
linc.genes <- ens.86.genes$symbol[ens.86.genes$gene_biotype=="lincRNA"]
cust.set <- Features(x.seurat)[Features(x.seurat) %in% linc.genes]
x.seurat <- get.percent.qc(x.seurat, custom = cust.set, custom.name = "plincRNA_RNA")
return(x.seurat)
})
## fix list names, subset to common features
names(seurat.list) <- samplesheet$Directory
seaurat.features <- lapply(seurat.list, Features)
common.features <- Reduce(intersect, seaurat.features)
length(common.features)
seurat.list <- lapply(seurat.list, \(x) subset(x, features = common.features))
seurat.list <- lapply(seurat.list, \(x) {
x <- get.subset(x, n.feat.l = 1000, n.feat.u = 8000, p.mito = 10, p.ribo = 40)
subset(x, DoubletFinder == "Singlet")
})
## cell cycle scoring
seurat.list <- lapply(seurat.list, \(x){
x <- NormalizeData(x)
x <- CellCycleScoring(object = x,
g2m.features = cc.genes.updated.2019$g2m.genes,
s.features = cc.genes.updated.2019$s.genes)
x <- AddMetaData(x,
metadata = x$S.Score - x$G2M.Score,
col.name = "Cycle.Score")
x
})
options(future.globals.maxSize = 2048*1024^2)
gc()
## sctransform normalisation
seurat.list <- lapply(seurat.list, \(x){
DefaultAssay(x) <- "RNA"
x <- SCTransform(x,
method = "glmGamPoi",
vst.flavor = "v2",
verbose = TRUE,
do.scale = TRUE,
do.center = TRUE,
vars.to.regress = c("Cycle.Score", "pMitochondrial_RNA", "pRibosomal_RNA"),
variable.features.n = 2000,
new.assay.name = "SCT")
rm.genes <- c(grep("^MT-", rownames(x), value = TRUE),
grep("^M?RP[SL]", rownames(x), value = TRUE),
grep("[\\.]", rownames(x), value = TRUE),
grep("^LINC", rownames(x), value = TRUE),
c("MALAT1", "CRYAB"))
var.genes <- x@assays$SCT@var.features[which(!x@assays$SCT@var.features %in% rm.genes)]
VariableFeatures(x[["SCT"]]) <- var.genes
x
})