Chapter 3 Bulk

3.1 Pre-processing

In bulk RNA-seq analysis, Salmon was used to generate the count and transcripts per kilobase of exon model per million mapped reads (TPM) matrix. The DESeq2, limma, and edgeR packages were used to calculate differentially expressed genes (DEGs) with significance level setting as P-value < 0.05.

#!/bin/bash
#SBATCH -p CPU # partition (queue)
#SBATCH --job-name=CART
#SBATCH -n 8
#SBATCH --array=1-50
#SBATCH -t 7-00:00 # time (D-HH:MM)
#SBATCH -o _log/mm.%N.%A_%a.out # STDOUT
#SBATCH -e _log/mm.%N.%A_%a.err # STDERR
#SBATCH --mail-type=END,FAIL # notifications for job done & fail
#SBATCH --mail-user=XX # send-to address

id=`sed -n ${SLURM_ARRAY_TASK_ID}p ../cart.sample.txt`
echo "${id}"

fq_path=${fq_path}

fq1=${fq_path}/${id}_R1.fq.gz
fq2=${fq_path}/${id}_R2.fq.gz

gtf_file=${hg38_ref}/gencode.v37.annotation.gtf
out_path=${hg38_ref}/salmon
salmon_index=${hg38_ref}/gencode.v37.almon

$SALMON_1_3 quant -p 40 -l IU -i ${salmon_index} -o ${out_path}/${id} -1 ${fq1} -2 ${fq2} -g ${gtf_file} --gcBias --validateMappings

Then the TPM and count matrices were generated.

3.2 DEGs using DESeq2

exp <- filter_count51

# test
comp <- c("Day0", "Day3~5")
pd1_com_t <- pd1[pd1$group1 %in% compare_t,]
exp_com_t <- exp[,match(pd1_com_t$ID,colnames(exp))]
Group_t <- factor(pd1_com_t$group1,
                  levels = compare_t) 

colData <- data.frame(row.names =colnames(exp_com_t), 
                      condition=Group_t)

dds <- DESeqDataSetFromMatrix(
  countData = exp_com_t,
  colData = colData,
  design = ~ condition) 
dds <- DESeq(dds)

res <- results(dds, contrast = c("condition",rev(levels(Group_t))))
resOrdered <- res[order(res$pvalue),] 
DEG_t <- as.data.frame(resOrdered)
head(DEG_t)


# calculation
id <- unique(pd1$group1)[order(unique(pd1$group1))]#由小到大排序
deg <- list() 
n <- 0
for (i in 1:(length(id)-1)){
  for(is in ((i+1):length(id))){
    comp <- id[c(i,is)]
    n <- n+1
    pd1_com <- pd1[pd1$group1 %in% comp,]
    exp_com <- exp[,match(pd1_com$ID,colnames(exp))] 
    Group <- factor(pd1_com$group1,
                    levels = comp)

    colData <- data.frame(row.names =colnames(exp_com), 
                          condition=Group)
    
    dds <- DESeqDataSetFromMatrix(
      countData = exp_com,
      colData = colData, 
      design = ~ condition) 
    dds <- DESeq(dds)

    res <- results(dds, contrast = c("condition",rev(levels(Group))))
    resOrdered <- res[order(res$pvalue),]
    deg[[n]] <- as.data.frame(resOrdered)
    names(deg)[[n]] <- paste0(comp[1],"Vs",comp[2])
    
  }
}

tr <- c()
clus <- list()
gene_id <- rownames(exp)

for (i in 1:length(deg)){
  te <- data.frame(Symbol=rownames(deg[[i]]),adj.P.Val=deg[[i]]$padj)
  colnames(te)[2] <- names(deg)[[i]]
  clus[[i]] <- te[match(gene_id,te[,1]),]
  names(clus)[[i]] <- names(deg)[[i]]
  tr[i] <- identical(gene_id,clus[[i]][,1])
}
table(tr)

deg_all <- bind_cols(clus) 
rownames(deg_all) <- gene_id
deg_all <- deg_all[,!str_detect(colnames(deg_all),"Symbol")]


pan <- c()
for (i in 1:length(rownames(deg_all))){
  pan[i] <- sum(deg_all[i,]<0.05,na.rm = TRUE)>0
}
table(pan)
deg_deseq2_ch <- deg_all[pan,]

3.3 DEGs using Limma

exp <- filter_count51

# test
comp <- c("Day0", "Day3~5")
pd1_com <- pd1[pd1$group1 %in% comp,]
exp_com <- exp[,match(pd1_com$ID,colnames(exp))] 
identical(pd1_com$ID,colnames(exp_com))
Group_t <- factor(pd1_com$group1,
                  levels =comp) 
Group_t 
Group_ts <- ifelse(pd1_com$group1==comp[1],"control","treat")
Group_ts <- factor(Group_ts,
                   levels = c("control","treat"))
Group_ts 

design <- model.matrix(~0+Group_ts)
colnames(design)=levels(Group_ts)
rownames(design)=colnames(exp_com)
dge <- DGEList(counts=exp_com)
dge <- calcNormFactors(dge)

v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)

constrasts = paste(rev(levels(Group_ts)),collapse = "-")
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) 
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)

deg_t= topTable(fit2, coef=constrasts, n=Inf)

test <- as.data.frame(t(filter_fpkm51[rownames(filter_fpkm51)=="CHI3L2",pd1_com$ID]))
test$Group_t <- Group_t
ggplot(data = test,mapping = aes(x=Group_t,y=CHI3L2))+
  geom_boxplot()


### calculation
id <- unique(pd1$group1)[order(unique(pd1$group1))]
id
deg <- list() 
n <- 0
for (i in 1:(length(id)-1)){
  for(is in ((i+1):length(id))){
    comp <- id[c(i,is)]
    n <- n+1
    
    pd1_com <- pd1[pd1$group1 %in% comp,]
    exp_com <- exp[,match(pd1_com$ID,colnames(exp))] 
    identical(pd1_com$ID,colnames(exp_com))
    
    Group <- factor(pd1_com$group1,
                    levels =comp)
    Group 
    
    Group_s <- ifelse(pd1_com$group1==comp[1],"control","treat")
    Group_s <- factor(Group_s,
                      levels = c("control","treat"))
    Group_s 

    design <- model.matrix(~0+Group_s)
    colnames(design)=levels(Group_s)
    rownames(design)=colnames(exp_com)

    dge <- DGEList(counts=exp_com)
    dge <- calcNormFactors(dge)
    
    v <- voom(dge,design, normalize="quantile") 
    fit <- lmFit(v, design)
    
    constrasts = paste(rev(levels(Group_s)),collapse = "-")
    cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) 
    fit2=contrasts.fit(fit,cont.matrix)
    fit2=eBayes(fit2)
    
    deg[[n]]= topTable(fit2, coef=constrasts, n=Inf)
    
    names(deg)[[n]] <- paste0(comp[1],"Vs",comp[2])
  }
}

rownames(exp)
tr <- c()
clus <- list()
gene_id <- rownames(exp)
for (i in 1:length(deg)){
  te <- data.frame(Symbol=rownames(deg[[i]]),adj.P.Val=deg[[i]]$adj.P.Val)#te作为中介
  colnames(te)[2] <- names(deg)[[i]]
  clus[[i]] <- te[match(gene_id,te[,1]),]
  names(clus)[[i]] <- names(deg)[[i]]
  tr[i] <- identical(gene_id,clus[[i]][,1])
}
table(tr)
deg_all <- bind_cols(clus) 
rownames(deg_all) <- gene_id
deg_all <- deg_all[,!str_detect(colnames(deg_all),"Symbol")]


pan <- c()
for (i in 1:length(rownames(deg_all))){
  pan[i] <- sum(deg_all[i,]<0.05)>0
}
table(pan)
deg_limma_ch <- deg_all[pan,]

3.4 DEGs using edgeR

exp <- filter_count51


# test
comp <- c("Day0", "Day3~5")
pd1_com <- pd1[pd1$group1 %in% comp,]#得到分组pd
exp_com <- exp[,match(pd1_com$ID,colnames(exp))] 
Group_t <- factor(pd1_com$group1,
                  levels = comp) 

dge <- DGEList(counts=exp_com,group=Group_t) 
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~0+Group_t)
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group_t)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1)) 

DEG=topTags(fit2, n=nrow(exp))
deg_t=as.data.frame(DEG) 



# calculation
id <- unique(pd1$group1)[order(unique(pd1$group1))]
id
deg <- list() 
n <- 0
for (i in 1:(length(id)-1)){
  for(is in ((i+1):length(id))){
    comp <- id[c(i,is)]
    n <- n+1
    pd1_com <- pd1[pd1$group1 %in% comp,]
    exp_com <- exp[,match(pd1_com$ID,colnames(exp))] 
    Group <- factor(pd1_com$group1,
                    levels = comp) 
    
    dge <- DGEList(counts=exp_com,group=Group) 
    dge$samples$lib.size <- colSums(dge$counts)
    dge <- calcNormFactors(dge) 
    
    design <- model.matrix(~0+Group)
    rownames(design)<-colnames(dge)
    colnames(design)<-levels(Group)
    
    dge <- estimateGLMCommonDisp(dge, design)
    dge <- estimateGLMTrendedDisp(dge, design)
    dge <- estimateGLMTagwiseDisp(dge, design)
    
    fit <- glmFit(dge, design)
    fit2 <- glmLRT(fit, contrast=c(-1,1)) 
    
    DEG=topTags(fit2, n=nrow(exp))
    deg[[n]]=as.data.frame(DEG) 
    
    names(deg)[[n]] <- paste0(comp[1],"Vs",comp[2])
  }
}

tr <- c()
clus <- list()
gene_id <- rownames(deg[[1]])

for (i in 1:length(deg)){
  te <- data.frame(Symbol=rownames(deg[[i]]),adj.P.Val=deg[[i]]$FDR)#te作为中介
  colnames(te)[2] <- names(deg)[[i]]
  clus[[i]] <- te[match(gene_id,te[,1]),]#以gene_id固定相同的顺序
  names(clus)[[i]] <- names(deg)[[i]]
  tr[i] <- identical(gene_id,clus[[i]][,1])
}
table(tr)
deg_all <- bind_cols(clus) 
rownames(deg_all) <- gene_id
deg_all <- deg_all[,!str_detect(colnames(deg_all),"Symbol")]

pan <- c()
for (i in 1:length(rownames(deg_all))){
  pan[i] <- sum(deg_all[i,]<0.05,na.rm = TRUE)>0
}
table(pan)
deg_edgeR_ch <- deg_all[pan,]

3.5 Mfuzz

limma_sigGene <- rownames(deg_limma_ch)
edgeR_sigGene <- rownames(deg_edgeR_ch)
DESeq2_sigGene <- rownames(deg_deseq2_ch)

data <- list(limma=limma_sigGene,
             edgeR=edgeR_sigGene,
             DESeq2=DESeq2_sigGene)

library(VennDiagram)

col <- c('#0099CC','#FF6666','#FFCC99')

venn.diagram(data,
             lwd=1,
             lty=1,
             col=col,
             fill=col,
             cat.col=col,
             cat.cex = 1.8,
             rotation.degree = 0,
             cex=1.5,
             alpha = 0.5,
             reverse=TRUE,
             width=4000,height = 4000,resolution =600,margin=0.2,
             filename="d21_mfuzz/3DEG_venn.png",imagetype="png")


sigGene3 = intersect(intersect(limma_sigGene,
                               edgeR_sigGene),
                     DESeq2_sigGene)


eset <- new("ExpressionSet",exprs = DEGs_exp_averp)
eset <- filter.std(eset,min.std=0) 
eset <- standardise(eset)

c <-8
m <- mestimate(eset) 
set.seed(123)
cl <- mfuzz(eset, c = c, m = m) 

cl$size
sum(cl$size)

gene2cluster <- data.frame(gene=names(cl$cluster),cluster=cl$cluster)

head(cl$membership)
O <- overlap(cl)
Ptmp <-  overlap.plot(cl,over=O,thres=0.05)


library(RColorBrewer)
library(Mfuzz)
color.2 <- colorRampPalette(rev(c("#ff0000", "Yellow", "OliveDrab1")))(1000)

mfuzz.plot2(eset,cl,mfrow=c(2,4),new.window= FALSE,centre = TRUE,
            time.labels=colnames(DEGs_exp_averp),colo = color.2)

3.6 GSEA

clus.logfc <- list() #log2fc
for (i in 1:6) {
  clus.logfc[[i]]<- deg[[i]]$log2FoldChange
  names(clus.logfc[[i]]) <- rownames(deg[[i]])
  names(clus.logfc)[i] <- names(deg)[i]
}

i = 1
geneList <- clus.logfc[[i]]
geneList <- sort(geneList, decreasing = T)
egmt <- GSEA(geneList, TERM2GENE=geneset, verbose=T,pvalueCutoff = 1)