Chapter 6 PRO-Analysis
6.1 QC
- If a sample is an outlier in the PCA, NPX distribution, and QC plots, that this might indicate a true outlier.
library(OlinkAnalyze)
library(tidyverse)
library(openxlsx)
library(patchwork)
dir.create("./results/PRO",recursive = T)
#----------------------------------------------------------------------------------
# Step 1: Load clean Olink data
#----------------------------------------------------------------------------------
dat_clean <- readRDS("./data/20231123_olink_long.rds") %>% as_tibble()
# SampleID Index OlinkID UniProt Assay MissingFreq Panel_Version PlateID QC_Warning LOD NPX
# <chr> <int> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
# 1 A1 1 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 13.0
# 2 A2 2 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 11.3
# 3 A3 3 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 25.5
# 4 A4 4 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 14.5
# 5 A5 5 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 7.63
# 6 A6 6 OID012… O00533 CHL1 0.0188 v.1201 Exampl… Pass 2.37 6.32
#----------------------------------------------------------------------------------
# Step 2: PCA plot: check QC_Warning samples & Group
#----------------------------------------------------------------------------------
p1 <- dat_clean %>% olink_pca_plot(label_samples = T, quiet = T)
p2 <- dat_clean %>% olink_pca_plot(color_g = "Group", quiet = T)
p <- p1[[1]] + p2[[1]]
ggsave("./results/PRO/1.PCA_QC.png", p, width = 12, height = 5)
## PCA plot: 3 SD from the mean of the plotted PC
# Samples more than +/-outlierDef[3,3] standard deviations from the mean of the plotted PC will be labelled.
p <- dat_clean %>% olink_pca_plot(outlierDefX = 3, outlierDefY = 3, outlierLines = T, label_outliers = T, quiet = T)
ggsave("./results/PRO/1.PCA_QC_outlierDef_3sd.png", p[[1]], width = 6, height = 4)
#----------------------------------------------------------------------------------
# Step 3: Boxplot: NPX distribution plot
#----------------------------------------------------------------------------------
p1 <- dat_clean %>% olink_dist_plot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p2 <- dat_clean %>% olink_dist_plot(color_g = "Group") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p <- p1 + p2 + plot_layout(ncol = 1)
ggsave("./results/PRO/2.Box_distribution.png", p, width = 12, height = 7)
#----------------------------------------------------------------------------------
# Step 4: QC plot: IQR_median
#----------------------------------------------------------------------------------
# IQR: interquartile range, 四分位距
# dashed lines: +/-3 standard deviations
p1 <- dat_clean %>% olink_qc_plot()
p2 <- dat_clean %>% olink_qc_plot(color_g = "Group")
p <- p1 + p2
ggsave("./results/PRO/3.IQR_qc.png", p, width = 12, height = 5)
-注:是否应排除这些异常值取决于潜在异常值与其他样本的不同程度,以及异常值是否可以追溯到生物学、样本特定或技术问题。综合上述几个分析,本测试数据的QC_Warning样本予以保留。
6.2 Differential Protein Expression Analysis
library(rstatix)
#----------------------------------------------------------------------------------
# Step 1: Basic statistical analysis on clinical data: Age & Gender
#----------------------------------------------------------------------------------
dat_clean %>% dplyr::select(Group, Gender) %>% table() %>% chisq_test() ## 性别在分组之间有显著差异,需要进行矫正
# n statistic p df method p.signif
# * <int> <dbl> <dbl> <int> <chr> <chr>
# 1 7084 261. 1.24e-58 1 Chi-square test ****
dat_clean %>% t_test( Age ~ Group , detailed = T)
# estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p df conf.low conf.high
# * <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.227 79.2 79.0 Age Treated Untre… 2208 4876 0.692 0.489 4007. -0.416 0.871
#----------------------------------------------------------------------------------
# Step 2: T test, no adjustment
#----------------------------------------------------------------------------------
# perform t-test
dat_clean <- dat_clean %>% mutate(Group = factor(Group,levels=c("Treated","Untreated")))
ttest_results <- olink_ttest(df = dat_clean, variable = "Group")
# cutoff: adj.P.Val < 0.05 && |FC| > 1.5
ttest_results <- ttest_results %>%
dplyr::rename("log2FC" = "estimate") %>%
mutate(Group = case_when( Adjusted_pval<0.05&log2FC>0.58 ~ "up",
Adjusted_pval<0.05&log2FC< -0.58 ~ "down",
.default = "not sig"))
# output
write.xlsx(ttest_results, file = "./results/PRO/4.deps_ttest_results.xlsx")
table(ttest_results$Group)
- Olink deps results (no adjustment): 4.deps_ttest_results.xlsx
#----------------------------------------------------------------------------------
# Step 3: limma, adjusted by Gender
#----------------------------------------------------------------------------------
library(limma)
# data: wide format
exp <- readRDS("./data/20231123_olink_wide.rds")
meta <- readRDS("./data/20231123_olink_meta.rds")
identical(rownames(meta),colnames(exp))
# limma
meta$contrast <- as.factor(meta$Group)
design <- model.matrix(~ 0 + contrast + Gender , data = meta)
fit <- lmFit(exp, design)
contrast <- makeContrasts( limma_adjAge = contrastTreated - contrastUntreated ,
levels = design)
fits <- contrasts.fit(fit, contrast)
ebFit <- eBayes(fits)
# result
# KO_WT
res <- topTable(ebFit, coef = "limma_adjAge", adjust.method = 'fdr', number = Inf)
limma.res <- res %>% filter(!is.na(adj.P.Val)) %>%
mutate( logP = -log10(P.Value) ) %>%
mutate( contrast = "Treated_Untreated") %>%
mutate( logFDR = -log10(adj.P.Val) ) %>%
mutate( GeneSymbol = rownames(res)) %>%
dplyr::select(GeneSymbol,everything()) %>%
as_tibble()
## cutoff: adj.P.Val < 0.05 && |FC| > 1.5
limma.res <- limma.res %>% mutate(group = case_when( adj.P.Val<0.05&logFC>0.58 ~ "up",
adj.P.Val<0.05&logFC< -0.58 ~ "down",
.default = "not sig"))
limma.res %>% count(group)
## output
write.xlsx( limma.res, "./results/PRO/5.deps_limma_adjAge_results.xlsx", overwrite = T, rowNames = F)
其他分析及可视化部分可参考RNA-seq分析