Chapter 5 PRO-Data
- Data preprocessing
library(OlinkAnalyze)
library(tidyverse)
library(openxlsx)
#----------------------------------------------------------------------------------
# Step 1: Load raw Olink data
#----------------------------------------------------------------------------------
data(npx_data1) # OlinkAnalyze 包内置demo数据
dat_olink <- npx_data1 %>% filter(PlateID=="Example_Data_1_CAM.csv") # long format
# SampleID Index OlinkID UniProt Assay MissingFreq Panel_Version PlateID QC_Warning LOD NPX
# <chr> <int> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
# 1 A1 1 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 13.0
# 2 A2 2 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 11.3
# 3 A3 3 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 25.5
# 4 A4 4 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 14.5
# 5 A5 5 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 7.63
# 6 A6 6 OID01216 O00533 CHL1 0.0188 v.1201 Example_Data… Pass 2.37 6.32
dat_olink <- dat_olink %>% filter(!str_detect(SampleID,'CONTROL_SAMPLE'))
#----------------------------------------------------------------------------------
# Step 2: add clinical data
#----------------------------------------------------------------------------------
# cli <- data_frame(ID = unique(dat_olink$SampleID),
# Age = sample(60:100, length(unique(dat_olink$SampleID)), replace = TRUE),
# Gender = sample(c("Female","Male"), length(unique(dat_olink$SampleID)), replace = TRUE))
# write.xlsx(cli,"./data/20231129_olink_cli.xlsx")
cli <- read.xlsx("./data/20231129_olink_cli.xlsx")
# ID Age Gender
# 1 A1 95 Male
# 2 A2 87 Female
# 3 A3 82 Female
# 4 A4 65 Female
# 5 A5 88 Female
# 6 A6 62 Female
dat_olink <- dat_olink %>% left_join(cli,by = c("SampleID"="ID"))
dat_olink <- dat_olink %>% mutate(Group=case_match(Treatment,
"Treated"~"Treated",
"Untreated"~"Untreated",
.default = "QC"))
# 分组情况
dat_olink %>% distinct(SampleID,Group) %>% count(Group)
# 查看Warning样本
dat_olink %>% filter(QC_Warning=="Warning") %>% pull(SampleID) %>% unique() # A sample will get a QC warning (not pass the quality control) if Incubation Control 2 or Detection Control (corresponding to that specific sample) deviates more than a pre-determined value (+/- 0.3) from the median value of all samples on the plate.
#----------------------------------------------------------------------------------
# Step 3: exclude proteins with MissingFreq > 60%
#----------------------------------------------------------------------------------
# https://olink.com/faq/how-is-the-limit-of-detection-lod-estimated-and-handled/
dat_clean <- dat_olink %>% filter(MissingFreq <= 0.6) ## 需根据数据调整阈值 # excluse 0 protein
#----------------------------------------------------------------------------------
# Step 4: output
#----------------------------------------------------------------------------------
## long format
dat_clean
write.xlsx(dat_clean,"./data/20231123_olink_long.xlsx")
saveRDS(dat_clean,"./data/20231123_olink_long.rds")
## wide format
dat_clean_wide <- dat_clean %>% dplyr::select(SampleID,NPX,Assay) %>%
pivot_wider(names_from = "SampleID",
values_from = "NPX") %>%
as.data.frame() %>% column_to_rownames("Assay")
# A1 A2 A3 A4 A5 A6 A7 A8
# CHL1 12.9561426 11.269477 25.451070 14.453038 7.628712 6.316586 12.7306116 13.6663354
# NRP1 3.7297092 6.144801 6.950510 3.725393 7.198343 7.789239 10.8019996 9.8789692
# PLXNB2 2.0861223 1.483985 1.228214 3.278853 4.989030 4.654764 1.1875243 0.6908001
# FCGR3B 11.6096290 17.582141 10.494904 14.969670 13.087540 11.463464 7.9634394 0.1154297
# LILRB5 0.7241916 3.329488 4.392219 1.132330 1.237068 2.342279 -0.2936439 1.8003573
# APOM 8.5426236 10.395668 14.669500 16.194933 15.074472 13.904607 6.7865950 7.7481516
write.xlsx(dat_clean_wide,"./data/20231123_olink_wide.xlsx",rowNames=T,colNames=T,overwrite=T)
saveRDS(dat_clean_wide,"./data/20231123_olink_wide.rds")
## meta
meta <- data.frame(ID = colnames(dat_clean_wide))
meta <- meta %>% left_join(cli,by="ID")
meta <- meta %>% left_join(dat_clean %>% dplyr::select(SampleID,Group) %>% distinct(SampleID,Group),by=c("ID"="SampleID"))
rownames(meta) <- meta$ID
identical(rownames(meta),colnames(dat_clean_wide))
saveRDS(meta,"./data/20231123_olink_meta.rds")
# ID Age Gender Group
# A1 A1 95 Male Untreated
# A2 A2 87 Female Untreated
# A3 A3 82 Female Untreated
# A4 A4 65 Female Untreated
# A5 A5 88 Female Untreated
# A6 A6 62 Female Untreated
Olink clean data, long format: 20231123_olink_long.xlsx
Olink clean data, wide format: 20231123_olink_wide.xlsx