Chapter 2 scRNA-seq

2.1 Preprocessing

#!/bin/bash
#SBATCH -p CPU # partition (queue)
#SBATCH --job-name=NSC
#SBATCH -n 40
#SBATCH -t 7-00:00 # time (D-HH:MM)
#SBATCH --array=1-4
#SBATCH -o _log/scrna.%A_%a.out # STDOUT
#SBATCH -e _log/scrna.%A_%a.err # STDERR
#SBATCH --exclude=node30
#SBATCH --mail-type=END,FAIL # notifications for job done & fail
#SBATCH --mail-user=809449456@qq.com # send-to address

id=`sed -n ${SLURM_ARRAY_TASK_ID}p sample.txt`
echo "${id}"

fq_path=.
index_path=refdata-gex-GRCh38-2020-A

$CELLRANGER_6 count --id=${id} \
  --fastqs=${fq_path} \
  --sample=${id} \
  --transcriptome=${index_path}

2.2 Load packages

suppressMessages({
library(Seurat)
library(dplyr)
library(Matrix)
library(gplots)
library(matrixStats)
library(sva)
library(ggpubr)
library(openxlsx)
library(stringr)
library(scran)
library(ggthemes)
library(ggthemes)
library(destiny)
library(grDevices)
library(reticulate)
library(L1Graph)
library(Biobase)
library(scatterplot3d)
library(monocle)
library(pheatmap)
library(harmony)
library(SingleR)
library(Signac)
library(dplyr)
})

color.lib <- c("#A6761D", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
               "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#F4B3BE",
               "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", 
               "#F4A11D", "#8DC8ED", "#4C6CB0", "#8A1C1B", "#CBCC2B", "#EA644C",
               "#634795", "#005B1D", "#26418A", "#CB8A93", "#F1E404", "#E22826",
               "#50C1FF", "#F4D31D", "#F4A11D", "#82C800", "#8B5900", "#858ED1",
               "#FF72E1", "#CB50B2", "#007D9B", "#26418A", "#8B495F", "#FF394B")

2.3 Analysis of scRNA-seq data

out.path <- "out.path"

############ analysis
p53 <- CreateSeuratObject(counts = p53.data, project = "p53", min.cells = 0, min.features = 0)
p53[["percent.mt"]] <- PercentageFeatureSet(p53, pattern = "^MT-")
p53
summary(p53@meta.data$percent.mt)

p53@meta.data$Sample = sample.name



# Visualize QC metrics as a violin plot
pdf(paste0(out.path, "/1.vlnplot.pdf"), width = 12, height = 7)
VlnPlot(p53, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dev.off()

pdf(paste0(out.path, "/1.geneplot.pdf"), width = 12, height = 7)
plot1 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
dev.off()



p53 <- subset(p53, subset = nFeature_RNA > 200 & percent.mt < 20)
p53



pdf(paste0(out.path, "/2.filter.vlnplot.pdf"), width = 12, height = 7)
VlnPlot(p53, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
dev.off()

pdf(paste0(out.path, "/2.filter.geneplot.pdf"), width = 12, height = 7)
plot1 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
dev.off()

p53 <- NormalizeData(p53, normalization.method = "LogNormalize", scale.factor = 10000)
p53 <- FindVariableFeatures(p53, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(p53), 10)

# plot variable features with and without labels
pdf(paste0(out.path, "/3.VariableFeaturePlot.pdf"), width = 12, height = 7)
plot1 <- VariableFeaturePlot(p53)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
dev.off()

all.genes <- rownames(p53)
p53 <- ScaleData(p53, features = VariableFeatures(p53))
p53 <- RunPCA(p53, features = VariableFeatures(object = p53))

p <- DimPlot(p53, reduction = "pca") + theme_few()
ggsave(paste0(out.path, "/4.PCA.pdf"), p, width = 9, height = 7)


p53 <- FindNeighbors(p53, reduction = "pca", dims = 1:30)
p53 <- FindClusters(p53, resolution = 0.8)

p53 <- RunTSNE(p53, reduction = "pca", dims = 1:30, perplexity = 50)
p53 <- RunUMAP(p53, reduction = "pca", dims = 1:30)

2.4 Visualization of scRNA-seq data

p <- DimPlot(p53, reduction = "umap", pt.size = 1, 
             cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.cluster.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = 1, label = TRUE, label.size = 10,
             cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.cluster.label.pdf"), p, width = 9, height = 7)


p <- DimPlot(p53, reduction = "tsne", pt.size = 1, 
             cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/6.tSNE.cluster.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "tsne", pt.size = 1, label = TRUE, label.size = 10,
             cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/6.tSNE.cluster.label.pdf"), p, width = 9, height = 7)



############### Cell cycle
cc.genes

p53 <- CellCycleScoring(p53, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)

p <- DimPlot(p53, reduction = "umap", pt.size = 1, label = FALSE, 
             group.by = "Phase", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/7.CC.phase.pdf"), p, width = 9, height = 7)

p <- VlnPlot(p53, features = "S.Score", pt.size = 0, 
             group.by = "seurat_clusters", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/7.CC.score.S.Score.pdf"), p, width = 10, height = 5)

p <- VlnPlot(p53, features = "G2M.Score", pt.size = 0, 
             group.by = "seurat_clusters", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/7.CC.score.G2M.Score.pdf"), p, width = 10, height = 5)





####################### SingleR annotation
ref <- readRDS(file = "hs.BlueprintEncodeData.RDS")
pred.BlueprintEncodeData <- SingleR(test = p53@assays$RNA@data, ref = ref, labels = ref$label.main)

ref <- readRDS(file = "hs.HumanPrimaryCellAtlasData.RDS")
pred.HumanPrimaryCellAtlasData <- SingleR(test = p53@assays$RNA@data, ref = ref, labels = ref$label.main)

ref <- readRDS(file = "NovershternHematopoieticData.RDS")
pred.NovershternHematopoieticData <- SingleR(test = p53@assays$RNA@data, ref = ref, labels = ref$label.main)


p53@meta.data$CellType.BlueprintEncodeData <- pred.BlueprintEncodeData$labels
p53@meta.data$CellType.HumanPrimaryCellAtlasData <- pred.HumanPrimaryCellAtlasData$labels
p53@meta.data$CellType.NovershternHematopoieticData <- pred.NovershternHematopoieticData$labels



p <- DimPlot(p53, reduction = "umap", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.BlueprintEncodeData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.umap.BlueprintEncodeData.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.HumanPrimaryCellAtlasData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.umap.HumanPrimaryCellAtlasData.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.NovershternHematopoieticData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.umap.NovershternHematopoieticData.pdf"), p, width = 9, height = 7)


p <- DimPlot(p53, reduction = "tsne", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.BlueprintEncodeData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.tsne.BlueprintEncodeData.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "tsne", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.HumanPrimaryCellAtlasData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.tsne.HumanPrimaryCellAtlasData.pdf"), p, width = 9, height = 7)

p <- DimPlot(p53, reduction = "tsne", pt.size = 1, label = TRUE, label.size = 4,
             group.by = "CellType.NovershternHematopoieticData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/8.SingleR.tsne.NovershternHematopoieticData.pdf"), p, width = 9, height = 7)

2.5 QC for droplets

library(parallel)
cl <- makeCluster(getOption("cl.cores", 4 ))


markers.par <- parLapply(cl, sample.list, function(x) {

  obj.raw <- readRDS(paste0("obj/", x, ".rds"))
  demux <- DropletUtils::hashedDrops( obj.raw@assays$RNA@counts )
  demux$Cell <- rownames( obj.raw@meta.data )
  openxlsx::write.xlsx(demux, paste0("obj/", x, "_demux.xlsx"), rowNames = T, overwrite = T  )
})

2.6 Merge scRNA-seq data

################ merge expression profile
exp.data <- NULL
meta.data <- NULL
for (kkk in 1:length(sample.list)) {
message(kkk, "   ", sample.list[kkk])

obj.raw <- readRDS(paste0("obj/", sample.list[kkk], ".rds"))
demux <- read.xlsx(paste0("obj/", sample.list[kkk], "_demux.xlsx"))

obj.raw@meta.data$Doublet <- demux$Doublet[match(rownames(obj.raw@meta.data), demux$Cell)]


exp.data <- cbind(exp.data, obj.raw@assays$RNA@counts )
meta.data <- rbind(meta.data, obj.raw@meta.data[, c("Sample","Doublet","CellType.BlueprintEncodeData","CellType.HumanPrimaryCellAtlasData","CellType.NovershternHematopoieticData")] )

}


############ analysis
p53 <- CreateSeuratObject(counts = exp.data, project = "p53", min.cells = 0, min.features = 0)
p53[["percent.mt"]] <- PercentageFeatureSet(p53, pattern = "^MT-")
p53
summary(p53@meta.data$percent.mt)


p53@meta.data <- cbind(p53@meta.data, meta.data[, 1:5])


p53 <- subset(p53, subset = nFeature_RNA > 500 & percent.mt < 20 & nCount_RNA < 30000 & Doublet == FALSE)


############ analysis
pdf(paste0(out.path, "/2.filter.vlnplot.pdf"), width = 12, height = 7)
VlnPlot(p53, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size = 0, group.by = "Sample", ncol = 3)
dev.off()


pdf(paste0(out.path, "/2.filter.geneplot.pdf"), width = 12, height = 7)
plot1 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(p53, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
dev.off()

p53 <- NormalizeData(p53, normalization.method = "LogNormalize", scale.factor = ncol(p53) )


p53 <- FindVariableFeatures(p53, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(p53), 10)

# plot variable features with and without labels
pdf(paste0(out.path, "/3.VariableFeaturePlot.pdf"), width = 12, height = 7)
plot1 <- VariableFeaturePlot(p53)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
dev.off()

all.genes <- rownames(p53)
p53 <- ScaleData(p53, features = VariableFeatures(p53))
p53 <- RunPCA(p53, features = VariableFeatures(object = p53))

p <- DimPlot(p53, reduction = "pca", group.by = "Sample", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/4.PCA.pdf"), p, width = 9, height = 7)



##################### Harmony 
length(VariableFeatures(p53))
p53 <- RunHarmony(p53, "Sample", max.iter.harmony = 1, sigma = 0.1)
p53 <- FindNeighbors(p53, reduction = "harmony", dims = 1:20)
p53 <- FindClusters(p53, resolution = 1)
p53 <- RunUMAP(p53, reduction = "harmony", dims = 1:20)


set.seed(2)
kmeans.cluster <- stats::kmeans(p53@reductions$harmony@cell.embeddings[, 1:20], centers = 90, iter.max = 100)
summary(as.numeric(table(kmeans.cluster$cluster)))
p53$Kmeans <- kmeans.cluster$cluster


pt.size = 0.2

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE,
             group.by = "Sample", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.sample.pdf"), p, width = 8, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, label = T, label.size = 8,
             group.by = "seurat_clusters", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.seurat_clusters.pdf"), p, width = 8.5, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, label = T, label.size = 3,
             group.by = "Kmeans") + theme_few()
ggsave(paste0(out.path, "/5.UMAP.kmeans.pdf"), p, width = 10, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE,
             group.by = "seurat_clusters", split.by = "Sample", cols = color.lib, ncol = 4) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.seurat_clusters.s.pdf"), p, width = 18, height = 9)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, shuffle = TRUE,
             group.by = "Disease", cols = c("#003366","#db3434","#CCCCCC") ) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.Disease.pdf"), p, width = 8, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, shuffle = TRUE,
             group.by = "Disease", cols = c("#db3434","#db3434","#CCCCCC") ) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.Disease-shake.pdf"), p, width = 8, height = 7)



p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, label = TRUE, label.size = 4,
             group.by = "CellType.BlueprintEncodeData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.SingleR.BlueprintEncodeData.pdf"), p, width = 10, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, label = TRUE, label.size = 4,
             group.by = "CellType.HumanPrimaryCellAtlasData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.SingleR.HumanPrimaryCellAtlasData.pdf"), p, width = 10, height = 7)

p <- DimPlot(p53, reduction = "umap", pt.size = pt.size, raster = FALSE, label = TRUE, label.size = 4,
             group.by = "CellType.NovershternHematopoieticData", cols = color.lib) + theme_few()
ggsave(paste0(out.path, "/5.UMAP.SingleR.NovershternHematopoieticData.pdf"), p, width = 10, height = 7)


p <- FeaturePlot(object = p53, features = c("CD34","MPO","CD14","FCGR3A","CD1C","IL3RA","FCGR3B","S100A8","PF4"), 
              cols = c("#CCCCCC", "red"), pt.size = 0.1, raster = TRUE, ncol = 3,
              reduction = "umap")
ggsave(paste0(out.path, "/6.cell.marker_Mye.pdf"), p, width = 19, height = 16)

p <- FeaturePlot(object = p53, features = c("CD79A","CD4","CD8A","CCR7","FOXP3","IL7R","FGFBP2","NCAM1","XCL1"), 
              cols = c("#CCCCCC", "red"), pt.size = 0.1, raster = TRUE, ncol = 3,
              reduction = "umap")
ggsave(paste0(out.path, "/6.cell.marker_NKT.pdf"), p, width = 19, height = 16)

2.7 Markers identification

Idents(p53) <- p53$CellType
all.markers <- FindAllMarkers(p53, only.pos = TRUE)
all.markers <- all.markers[which(all.markers$p_val_adj < 0.05), ]
write.xlsx(all.markers, paste0(out.path, "/1.topMarkers.CellType.xlsx"), rowNames = T, overwrite = T)


all.markers <- read.xlsx(paste0(out.path, "/1.topMarkers.CellType.xlsx"), rowNames = T)
top10 <- all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
gene.list <- unique(top10$gene)
p <- DotPlot(p53, features = rev(gene.list), group.by = "CellType", dot.scale = 8, cols = c("#DDDDDD", "#003366" ), col.min = -1) + RotatedAxis()
p <- p + theme_base() + theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=20))
p <- p + theme(axis.text.y = element_text(size = 10))
p <- p + coord_flip() + scale_size(range = c(0, 6))
p <- p + gradient_color(c("#EEEEEE","#ffb459","#e8613c","#b70909"))
ggsave(paste0(out.path, "/Top.markers.CellType.pdf"), p, width = 8, height = 13)


gene.list <- c("CD4","CD8A","CD3E","FOXP3","IL2RA","DUSP4","TIGIT","RTKN2",
               "CCR7","SELL","CD40LG","IL7R","ITK","PDE3B",
               "GZMB","GZMA",
               "NKG7","FGFBP2","NCAM1","XCL1",
               "CD79A","MS4A1","TCL1A","BANK1",
               "IGHG1","IGHG3","IGHG4",
               "PTGDS","LILRA4","PLD4","SCT",
               "CD14","S100A8","S100A9","FCN1","LYZ","FCGR3A",
               "CD1C","FCER1A",
               "FCGR3B",
               "PTCRA","TUBB1","PF4","GP9",
               "HBB","HBA1","CA1")

p <- DotPlot(p53, features = rev(gene.list), group.by = "seurat_clusters", dot.scale = 8, cols = c("#DDDDDD", "#003366" ), col.min = -2) + RotatedAxis()
p <- p + theme_base() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 18))
p <- p + theme(axis.text.y = element_text(size = 12))
p <- p + coord_flip() #+ scale_size(range = c(1, 7))
p <- p + gradient_color(c("#EEEEEE","#ffb459","#e8613c","#b70909"))
ggsave(paste0(out.path, "/8.Top.markers.Cluster.sel.pdf"), p, width = 10, height = 14)

2.8 CellPhoneDB analysis

########### cellphonedb
out.path.ccc <- paste0(out.path, "/ccc_all_state")
system(sprintf("mkdir %s", out.path.ccc))

gene.id <- read.table("gene.id.v40.txt")
gene.id.coding <- gene.id[which(gene.id$V6 == "protein_coding"), ]


out.data <- p53@assays$RNA@data[, which(!is.na(p53$CellTypeAnno))]
out.data <- out.data[rowSums(out.data) > 0, ]
out.data <- out.data[rownames(out.data) %in% gene.id.coding$V7, ]

out.data <- as.data.frame(out.data)
out.data <- round(out.data, 5)
out.ccc.mat <- data.frame(Gene = rownames(out.data), out.data)



out.meta <- p53@meta.data[colnames(out.data), c("CellType","CellTypeAnno")]
colnames(out.meta) <- c("Cell", "cell_type")
out.meta$Cell = rownames(out.meta)
out.meta$Cell <- str_replace_all(out.meta$Cell, "-", ".")


sum(out.meta$Cell != colnames(out.ccc.mat)[2:ncol(out.ccc.mat)])

write.table(out.ccc.mat, paste0(out.path.ccc, "/exp.mat.txt"), row.names = F, col.names = T, sep = "\t", quote = F)
write.table(out.meta, paste0(out.path.ccc, "/meta.data.txt"), row.names = F, col.names = T, sep = "\t", quote = F)

'
cellphonedb method statistical_analysis meta.data.txt exp.mat.txt --threads=20 --iterations=100 --counts-data=gene_name --output-path=out.merge_tumor_T


'