Chapter 5 PRO-Data

  • Data preprocessing
library(OlinkAnalyze)
library(tidyverse)
library(openxlsx)
#----------------------------------------------------------------------------------
#  Step 1: Load raw Olink data
#----------------------------------------------------------------------------------
data(npx_data1) # OlinkAnalyze 包内置demo数据
dat_olink <- npx_data1 %>% filter(PlateID=="Example_Data_1_CAM.csv") # long format
#   SampleID Index OlinkID  UniProt Assay MissingFreq Panel_Version PlateID       QC_Warning   LOD   NPX
#   <chr>    <int> <chr>    <chr>   <chr>       <dbl> <chr>         <chr>         <chr>      <dbl> <dbl>
# 1 A1           1 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37 13.0 
# 2 A2           2 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37 11.3 
# 3 A3           3 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37 25.5 
# 4 A4           4 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37 14.5 
# 5 A5           5 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37  7.63
# 6 A6           6 OID01216 O00533  CHL1       0.0188 v.1201        Example_Data… Pass        2.37  6.32
dat_olink <- dat_olink %>% filter(!str_detect(SampleID,'CONTROL_SAMPLE'))
#----------------------------------------------------------------------------------
#  Step 2: add clinical data
#----------------------------------------------------------------------------------
# cli <- data_frame(ID  = unique(dat_olink$SampleID),
# Age = sample(60:100, length(unique(dat_olink$SampleID)), replace = TRUE),
# Gender = sample(c("Female","Male"), length(unique(dat_olink$SampleID)), replace = TRUE))
# write.xlsx(cli,"./data/20231129_olink_cli.xlsx")
cli <- read.xlsx("./data/20231129_olink_cli.xlsx")
#   ID Age Gender
# 1 A1  95   Male
# 2 A2  87 Female
# 3 A3  82 Female
# 4 A4  65 Female
# 5 A5  88 Female
# 6 A6  62 Female
dat_olink <- dat_olink %>% left_join(cli,by = c("SampleID"="ID"))
dat_olink <- dat_olink %>% mutate(Group=case_match(Treatment,
                                                   "Treated"~"Treated",
                                                   "Untreated"~"Untreated",
                                                   .default = "QC"))
# 分组情况
dat_olink %>% distinct(SampleID,Group) %>%  count(Group)
# 查看Warning样本
dat_olink %>% filter(QC_Warning=="Warning") %>% pull(SampleID) %>% unique() # A sample will get a QC warning (not pass the quality control) if Incubation Control 2 or Detection Control (corresponding to that specific sample) deviates more than a pre-determined value (+/- 0.3) from the median value of all samples on the plate.
#----------------------------------------------------------------------------------
#  Step 3: exclude proteins with MissingFreq > 60%
#----------------------------------------------------------------------------------
# https://olink.com/faq/how-is-the-limit-of-detection-lod-estimated-and-handled/
dat_clean <- dat_olink %>% filter(MissingFreq <= 0.6)       ## 需根据数据调整阈值  # excluse 0 protein
#----------------------------------------------------------------------------------
#  Step 4: output
#----------------------------------------------------------------------------------
## long format
dat_clean
write.xlsx(dat_clean,"./data/20231123_olink_long.xlsx")
saveRDS(dat_clean,"./data/20231123_olink_long.rds")
## wide format
dat_clean_wide <- dat_clean %>% dplyr::select(SampleID,NPX,Assay) %>% 
                                pivot_wider(names_from = "SampleID",
                                            values_from = "NPX") %>% 
                                as.data.frame() %>% column_to_rownames("Assay")
#                A1        A2        A3        A4        A5        A6         A7         A8
# CHL1   12.9561426 11.269477 25.451070 14.453038  7.628712  6.316586 12.7306116 13.6663354
# NRP1    3.7297092  6.144801  6.950510  3.725393  7.198343  7.789239 10.8019996  9.8789692
# PLXNB2  2.0861223  1.483985  1.228214  3.278853  4.989030  4.654764  1.1875243  0.6908001
# FCGR3B 11.6096290 17.582141 10.494904 14.969670 13.087540 11.463464  7.9634394  0.1154297
# LILRB5  0.7241916  3.329488  4.392219  1.132330  1.237068  2.342279 -0.2936439  1.8003573
# APOM    8.5426236 10.395668 14.669500 16.194933 15.074472 13.904607  6.7865950  7.7481516
write.xlsx(dat_clean_wide,"./data/20231123_olink_wide.xlsx",rowNames=T,colNames=T,overwrite=T)
saveRDS(dat_clean_wide,"./data/20231123_olink_wide.rds")
## meta 
meta <- data.frame(ID = colnames(dat_clean_wide))
meta <- meta %>% left_join(cli,by="ID")
meta <- meta %>% left_join(dat_clean %>% dplyr::select(SampleID,Group) %>% distinct(SampleID,Group),by=c("ID"="SampleID"))
rownames(meta) <- meta$ID
identical(rownames(meta),colnames(dat_clean_wide))
saveRDS(meta,"./data/20231123_olink_meta.rds")
#    ID Age Gender     Group
# A1 A1  95   Male Untreated
# A2 A2  87 Female Untreated
# A3 A3  82 Female Untreated
# A4 A4  65 Female Untreated
# A5 A5  88 Female Untreated
# A6 A6  62 Female Untreated