library(tidyverse)
library(data.table)
library(Biobase)
library(sva)
library(limma)
library(MicrobiomeAnalysis)
library(SummarizedExperiment)
library(snm)
library(ggpubr)
library(cowplot)
rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)
grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)
9 数据校正
批次效应在生物学数据分析中是一个普遍存在的问题,它指的是由于实验过程中非生物学因素(如样本处理时间、实验条件、测序平台等)的差异,导致实验结果中混入与研究目标不相关的变异。在比较对照组和实验组时,这些非生物学因素可能引入额外的噪声,影响对生物学问题真实效应的判断。
因此,在本章节中,采用了不同的降低批次效应的方法:
limma::voom
&snm::snm
:来自(Ritchie 等 2015)和(Mecham, Nelson, 和 Storey 2010),参考文章(Poore 等 2020);
9.1 加载R包
使用rm(list = ls())
来清空环境中的所有变量。
9.2 导入数据
输入数据来自于 章节 8
9.3 准备数据
- 准备三种校正批次效应的方法所需的输入数据。
metadata <- rbind(LIRI_JP_meta, TCGA_LIHC_meta)
profile <- LIRI_JP_eprs %>%
tibble::rownames_to_column("GeneID") %>%
dplyr::inner_join(TCGA_LIHC_eprs %>%
tibble::rownames_to_column("GeneID"),
by = "GeneID") %>%
tibble::column_to_rownames("GeneID")
head(as.matrix(profile[, 1:5]))
- 准备批次校正的因素及其矩阵
9.4 ComBat
sva::ComBat
(Leek 和 Storey 2007)是一种常用的批次校正算法,用于调整高通量数据中的批次效应。
9.5 removeBatchEffect
limma::removeBatchEffect
(Ritchie 等 2015)是另一种常用的批次校正算法,它的原理基于线性模型和对批次效应的建模。
9.6 Voom SNM
集合limma::voom
(Ritchie 等 2015) & snm::snm
(Mecham, Nelson, 和 Storey 2010)
get_VoomSNM <- function(
qcMetadata,
qcProfile,
batchVar,
groupVar){
# qcMetadata = dat_meta
# qcProfile = dat_raw
# batchVar = "ProjectID"
# groupVar = "Group"
colnames(qcMetadata)[which(colnames(qcMetadata) == batchVar)] <- "Batch"
colnames(qcMetadata)[which(colnames(qcMetadata) == groupVar)] <- "Group"
# Set up counts matrix
counts <- qcProfile # DGEList object from a table of counts (rows=features, columns=samples)
snmData <- snmDataObjOnly$norm.dat
return(snmData)
}
VoomSNM_data <- get_VoomSNM(
qcMetadata = dat_meta,
qcProfile = dat_raw,
batchVar = "ProjectID",
groupVar = "Group")
head(VoomSNM_data[, 1:5])
9.7 批次效应校正结果比较
MicrobiomeAnalysis
(Zou 2022)
get_plot <- function(
datMeta,
datProf,
group = "Group",
group_names = grp_names,
group_colors = grp_colors,
method = "PCA",
shape_var = "ProjectID",
shape_values = grp_shapes) {
datProf <- as.data.frame(datProf)
sid <- intersect(rownames(datMeta), colnames(datProf))
phen <- datMeta[rownames(datMeta) %in% sid, , ]
counts <- datProf %>%
dplyr::select(all_of(rownames(phen))) %>%
as.matrix()
if (!all(rownames(phen) == colnames(counts))) {
stop("wrong order of samples")
}
se <- SummarizedExperiment(
assays = SimpleList(counts = counts),
colData = phen
)
return(pl)
}
- 合并数据集的原始数据PCA结果
# 计算消耗时间太久,设置该代码避免重复运算
if (file.exists("./data/result/Figure/Data_RawData_pl.RDS")) {
RawData_pl <- readRDS("./data/result/Figure/Data_RawData_pl.RDS")
} else {
RawData_pl <- get_plot(
datMeta = metadata,
datProf = profile,
group = "Group",
method = "PCA",
shape_var = "ProjectID")
saveRDS(RawData_pl, "./data/result/Figure/Data_RawData_pl.RDS", compress = TRUE)
}
RawData_pl
- 合并数据集的
sva::ComBat
数据PCA结果
# 计算消耗时间太久,设置该代码避免重复运算
if (file.exists("./data/result/Figure/Data_ComBat_pl.RDS")) {
ComBat_pl <- readRDS("./data/result/Figure/Data_ComBat_pl.RDS")
} else {
ComBat_pl <- get_plot(
datMeta = metadata,
datProf = ComBat_data,
group = "Group",
method = "PCA",
shape_var = "ProjectID")
saveRDS(ComBat_pl, "./data/result/Figure/Data_ComBat_pl.RDS", compress = TRUE)
}
ComBat_pl
- 合并数据集的
limma::removeBatchEffect
数据PCA结果
# 计算消耗时间太久,设置该代码避免重复运算
if (file.exists("./data/result/Figure/Data_RME_pl.RDS")) {
RME_pl <- readRDS("./data/result/Figure/Data_RME_pl.RDS")
} else {
RME_pl <- get_plot(
datMeta = metadata,
datProf = RME_data,
group = "Group",
method = "PCA",
shape_var = "ProjectID")
saveRDS(RME_pl, "./data/result/Figure/Data_RME_pl.RDS", compress = TRUE)
}
RME_pl
- 合并数据集的
Voom+SNM
数据PCA结果
# 计算消耗时间太久,设置该代码避免重复运算
if (file.exists("./data/result/Figure/Data_VoomSNM_pl.RDS")) {
VoomSNM_pl <- readRDS("./data/result/Figure/Data_VoomSNM_pl.RDS")
} else {
VoomSNM_pl <- get_plot(
datMeta = metadata,
datProf = VoomSNM_data,
group = "Group",
method = "PCA",
shape_var = "ProjectID")
saveRDS(VoomSNM_pl, "./data/result/Figure/Data_VoomSNM_pl.RDS", compress = TRUE)
}
VoomSNM_pl
- 合并所有的图
为了便于观察不同批次校正方法的结果,需要将上述生成的图形进行合并。
adjusted_pl <- cowplot::plot_grid(
RawData_pl, ComBat_pl, RME_pl, VoomSNM_pl,
ncol = 2, align = "hv",
labels = LETTERS[1:4])
RawData_pl
ComBat_pl
RME_pl
VoomSNM_pl
结果:
- 根据观察,经过
Voom+SNM
方法校正后的基因表达谱数据分布并不倾向于按项目聚类,相较于其他两种方法,效果更佳。
9.8 校正后的结果
对上述三种方法校正后表达谱数据存成ExpressionSet
,以便后续分析。另外,采用生态学常用评估整体变异和组间关系的PERMANOVA
(Anderson 2014)检验方法评价校正前后的结果。
get_ExprSet <- function(
x,
y,
occurrence = 0.5) {
# metadata Description
sid <- dplyr::intersect(colnames(x), y$SampleID)
phen <- y[pmatch(sid, y$SampleID), , ]
prof <- x %>%
as.data.frame() %>%
dplyr::select(dplyr::all_of(phen$SampleID))
# determine the right order between profile and phenotype
for (i in 1:ncol(prof)) {
if (!(colnames(prof)[i] == rownames(phen)[i])) {
stop(paste0(i, " Wrong"))
}
}
expressionSet <- new("ExpressionSet",
exprs = exprs,
phenoData = adf,
experimentData = experimentData)
return(expressionSet)
}
- RawData:\(R^2\)结果
if (!dir.exists("./data/result/BatchEffect/")) {
dir.create("./data/result/BatchEffect/", recursive = TRUE)
}
if (file.exists("./data/result/BatchEffect/Raw_PERMANOVA.RDS")) {
Raw_PERMANOVA <- readRDS("./data/result/BatchEffect/Raw_PERMANOVA.RDS")
} else {
saveRDS(Raw_PERMANOVA, "./data/result/BatchEffect/Raw_PERMANOVA.RDS", compress = TRUE)
}
Raw_PERMANOVA
结果:基于PERMANOVA分析结果,Pr(>F) < 0.05 表明Group
和ProjectID
均与整体转录组结构存在显著差异。同时,ProjectID
解释了整体转录组结构变异的 69.3%,这表明批次效应在整体转录组结构中占据了相当大的比例。因此,在后续关于Group
的差异研究中,必须进行批次校正,以确保研究结果的准确性和可靠性。
- sva::ComBat: 校正后的R2结果
#| warning: false
#| message: false
ComBat_ExprSet <- get_ExprSet(
x = ComBat_data,
y = metadata,
occurrence = 0.5)
if (file.exists("./data/result/BatchEffect/ComBat_PERMANOVA.RDS")) {
ComBat_PERMANOVA <- readRDS("./data/result/BatchEffect/ComBat_PERMANOVA.RDS")
} else {
saveRDS(ComBat_PERMANOVA, "./data/result/BatchEffect/ComBat_PERMANOVA.RDS", compress = TRUE)
}
ComBat_PERMANOVA
结果:sva::ComBat
方法均降低了Group
和ProjectID
的解释的变异比例。ProjectID
的整体转录组结构变异从 69.3%降低至22.8%,而Group
则从1.6%降低至1.3%,这表明该方法同时对批次效应和生物学效应产生了影响。
- limma::removeBatchEffect
#| warning: false
#| message: false
RME_ExprSet <- get_ExprSet(
x = RME_data,
y = metadata,
occurrence = 0.5)
if (file.exists("./data/result/BatchEffect/RME_PERMANOVA.RDS")) {
RME_PERMANOVA <- readRDS("./data/result/BatchEffect/RME_PERMANOVA.RDS")
} else {
saveRDS(RME_PERMANOVA, "./data/result/BatchEffect/RME_PERMANOVA.RDS", compress = TRUE)
}
RME_PERMANOVA
结果:limma::removeBatchEffect
方法也均降低了Group
和ProjectID
的解释的变异比例。ProjectID
的整体转录组结构变异从 69.3%降低至9.7%,而Group
则从1.6%降低至1%,这表明该方法同时对批次效应和生物学效应产生了影响。该方法在降低批次效应要好于sva::ComBat
。
- Voom+SNM
VoomSNM_ExprSet <- get_ExprSet(
x = VoomSNM_data,
y = metadata,
occurrence = 0.5)
if (file.exists("./data/result/BatchEffect/VoomSNM_PERMANOVA.RDS")) {
VoomSNM_PERMANOVA <- readRDS("./data/result/BatchEffect/VoomSNM_PERMANOVA.RDS")
} else {
saveRDS(VoomSNM_PERMANOVA, "./data/result/BatchEffect/VoomSNM_PERMANOVA.RDS", compress = TRUE)
}
VoomSNM_PERMANOVA
结果:Voom+SNM
方法也均降低了Group
和ProjectID
的解释的变异比例。ProjectID
的整体转录组结构变异从 69.3%降低至0.3%,而Group
则从1.6%降低至1.3%,这表明该方法同时对批次效应和生物学效应产生了影响。相比其他两种方法(sva::ComBat
和limma::removeBatchEffect
),Voom+SNM
不仅显著降低批次效应并且也最大程度保留生物学效应。
9.9 输出校正后的结果
#| warning: false
#| message: false
if (!dir.exists("./data/result/ExpSetObject")) {
dir.create("./data/result/ExpSetObject", recursive = TRUE)
}
# saveRDS(ComBat_ExprSet, "./data/result/ExpSetObject/MergeExpSet_ComBat_LIRI-JP_TCGA-LIHC.RDS", compress = TRUE)
# saveRDS(RME_ExprSet, "./data/result/ExpSetObject/MergeExpSet_RME_LIRI-JP_TCGA-LIHC.RDS", compress = TRUE)
# saveRDS(MMUPHin_ExprSet, "./data/result/ExpSetObject/MergeExpSet_MMUPHin_LIRI-JP_TCGA-LIHC.RDS", compress = TRUE)
# saveRDS(ComBatSeq_ExprSet, "./data/result/ExpSetObject/MergeExpSet_ComBatSeq_LIRI-JP_TCGA-LIHC.RDS", compress = TRUE)
saveRDS(VoomSNM_ExprSet, "./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS", compress = TRUE)
if (!dir.exists("./data/result/Figure")) {
dir.create("./data/result/Figure", recursive = TRUE)
}
ggsave("./data/result/Figure/SFig1.pdf", adjusted_pl, width = 10, height = 7, dpi = 600)
9.10 总结
在评估了不同批次效应校正方法后,发现Voom + SNM
的组合在降低批次效应方面表现最佳,同时能够有效地保留生物学效应。这一组合不仅显著减少了由于非生物学因素导致的变异,还确保了数据中生物学信号的完整性和准确性。因此,决定使用Voom + SNM
校正后的表达谱数据,并将其保存为RDS文件格式,以便直接用于后续的下游分析。这样,能够基于准确且可靠的数据集,进行更深入的生物学研究。
系统信息
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] cowplot_1.1.3 ggpubr_0.6.0
[3] snm_1.50.0 SummarizedExperiment_1.32.0
[5] GenomicRanges_1.54.1 GenomeInfoDb_1.38.8
[7] IRanges_2.36.0 S4Vectors_0.40.2
[9] MatrixGenerics_1.14.0 matrixStats_1.3.0
[11] MicrobiomeAnalysis_1.0.3 limma_3.58.1
[13] sva_3.50.0 BiocParallel_1.36.0
[15] genefilter_1.84.0 mgcv_1.9-1
[17] nlme_3.1-164 Biobase_2.62.0
[19] BiocGenerics_0.48.1 data.table_1.15.4
[21] lubridate_1.9.3 forcats_1.0.0
[23] stringr_1.5.1 dplyr_1.1.4
[25] purrr_1.0.2 readr_2.1.5
[27] tidyr_1.3.1 tibble_3.2.1
[29] ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] fs_1.6.4 bitops_1.0-7
[3] DirichletMultinomial_1.44.0 httr_1.4.7
[5] RColorBrewer_1.1-3 doParallel_1.0.17
[7] numDeriv_2016.8-1.1 tools_4.3.3
[9] doRNG_1.8.6 backports_1.4.1
[11] utf8_1.2.4 R6_2.5.1
[13] vegan_2.6-4 lazyeval_0.2.2
[15] rhdf5filters_1.14.1 permute_0.9-7
[17] withr_3.0.0 gridExtra_2.3
[19] cli_3.6.2 sandwich_3.1-0
[21] mvtnorm_1.2-4 proxy_0.4-27
[23] yulab.utils_0.1.4 foreign_0.8-86
[25] scater_1.30.1 decontam_1.22.0
[27] readxl_1.4.3 rstudioapi_0.16.0
[29] RSQLite_2.3.6 shape_1.4.6.1
[31] generics_0.1.3 gtools_3.9.5
[33] car_3.1-2 Matrix_1.6-5
[35] biomformat_1.30.0 ggbeeswarm_0.7.2
[37] fansi_1.0.6 DescTools_0.99.54
[39] DECIPHER_2.30.0 abind_1.4-5
[41] lifecycle_1.0.4 multcomp_1.4-25
[43] yaml_2.3.8 edgeR_4.0.16
[45] carData_3.0-5 gplots_3.1.3.1
[47] rhdf5_2.46.1 SparseArray_1.2.4
[49] grid_4.3.3 blob_1.2.4
[51] crayon_1.5.2 lattice_0.22-6
[53] beachmat_2.18.1 annotate_1.80.0
[55] KEGGREST_1.42.0 pillar_1.9.0
[57] knitr_1.46 boot_1.3-30
[59] gld_2.6.6 corpcor_1.6.10
[61] codetools_0.2-19 glue_1.7.0
[63] MultiAssayExperiment_1.28.0 vctrs_0.6.5
[65] png_0.1-8 treeio_1.26.0
[67] Rdpack_2.6 cellranger_1.1.0
[69] gtable_0.3.5 cachem_1.0.8
[71] xfun_0.43 rbibutils_2.2.16
[73] S4Arrays_1.2.1 metagenomeSeq_1.43.0
[75] survival_3.7-0 SingleCellExperiment_1.24.0
[77] iterators_1.0.14 statmod_1.5.0
[79] bluster_1.12.0 gmp_0.7-4
[81] TH.data_1.1-2 ANCOMBC_2.4.0
[83] phyloseq_1.46.0 bit64_4.0.5
[85] irlba_2.3.5.1 KernSmooth_2.23-22
[87] vipor_0.4.7 rpart_4.1.23
[89] colorspace_2.1-0 DBI_1.2.2
[91] Hmisc_5.1-2 nnet_7.3-19
[93] ade4_1.7-22 Exact_3.2
[95] DESeq2_1.42.1 tidyselect_1.2.1
[97] bit_4.0.5 compiler_4.3.3
[99] glmnet_4.1-8 htmlTable_2.4.2
[101] BiocNeighbors_1.20.2 expm_0.999-9
[103] DelayedArray_0.28.0 caTools_1.18.2
[105] checkmate_2.3.1 scales_1.3.0
[107] digest_0.6.35 minqa_1.2.6
[109] rmarkdown_2.26 XVector_0.42.0
[111] htmltools_0.5.8.1 pkgconfig_2.0.3
[113] base64enc_0.1-3 lme4_1.1-35.3
[115] sparseMatrixStats_1.14.0 fastmap_1.1.1
[117] rlang_1.1.3 htmlwidgets_1.6.4
[119] DelayedMatrixStats_1.24.0 zoo_1.8-12
[121] jsonlite_1.8.8 energy_1.7-11
[123] BiocSingular_1.18.0 RCurl_1.98-1.14
[125] magrittr_2.0.3 Formula_1.2-5
[127] scuttle_1.12.0 GenomeInfoDbData_1.2.11
[129] Rhdf5lib_1.24.2 munsell_0.5.1
[131] Rcpp_1.0.12 ape_5.8
[133] viridis_0.6.5 CVXR_1.0-12
[135] stringi_1.8.4 rootSolve_1.8.2.4
[137] zlibbioc_1.48.2 MASS_7.3-60.0.1
[139] plyr_1.8.9 parallel_4.3.3
[141] ggrepel_0.9.5 lmom_3.0
[143] Biostrings_2.70.3 splines_4.3.3
[145] multtest_2.58.0 hms_1.1.3
[147] locfit_1.5-9.9 igraph_2.0.3
[149] ggsignif_0.6.4 Wrench_1.20.0
[151] rngtools_1.5.2 reshape2_1.4.4
[153] ScaledMatrix_1.10.0 XML_3.99-0.16.1
[155] evaluate_0.23 renv_1.0.0
[157] BiocManager_1.30.23 nloptr_2.0.3
[159] tzdb_0.4.0 foreach_1.5.2
[161] rsvd_1.0.5 broom_1.0.5
[163] xtable_1.8-4 Rmpfr_0.9-5
[165] e1071_1.7-14 tidytree_0.4.6
[167] rstatix_0.7.2 viridisLite_0.4.2
[169] class_7.3-22 gsl_2.1-8
[171] lmerTest_3.1-3 memoise_2.0.1
[173] beeswarm_0.4.0 AnnotationDbi_1.66.0
[175] cluster_2.1.6 TreeSummarizedExperiment_2.10.0
[177] timechange_0.3.0 mia_1.10.0