11 差异分析之limma
基因表达差异分析是一种用来比较不同条件下基因表达水平的方法,其目的是发现在不同条件下表达水平有显著差异的基因。这种分析通常在生物学研究中用于识别与特定生物过程、疾病或环境因素相关的基因。
11.1 加载R包
使用rm(list = ls())
来清空环境中的所有变量。
11.2 读取函数
determine_tables <- function(
object = NULL,
data_counts = NULL,
data_metadata = NULL,
group,
group_names = NULL,
p_adjust = c("none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"),
da_method = NULL) {
# p_adjust
p_adjust <- match.arg(p_adjust,
c("none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY")
)
# determine to use the ExpressionSet-class object
if (all(!is.null(object), !is.null(data_counts), !is.null(data_metadata))) {
message("Either ExpressionSet object or counts and metadata as inputdata")
message("Use the ExpressionSet object as default in the following processes")
}
# whether to extract tables from ExpressionSet
if (!is.null(object)) {
counts <- Biobase::exprs(object) %>%
data.frame()
metadata <- Biobase::pData(object) %>%
data.frame()
} else if (!is.null(data_counts)) {
counts <- data_counts %>%
data.frame()
if (!is.null(data_metadata)) {
metadata <- data_metadata %>%
data.frame()
} else {
stop("Please load sample data")
}
} else {
stop("Please load ExpressionSet and counts")
}
# check whether group is valid
meta_nms <- names(metadata)
if (!group %in% meta_nms) {
stop(
group, " are not contained in the `pData` of `ExpressionSet`",
call. = FALSE
)
}
# groups split
if (!is.null(group_names)) {
if (length(grep(":", group_names)) == 0) {
group_names <- group_names
} else {
group_names <- unlist(strsplit(group_names, ":", fixed = TRUE))
}
}
# metadata table: row -> SampleID; column -> phenotypic index
sample_data_df <- metadata %>%
data.frame()
colnames(sample_data_df)[which(colnames(sample_data_df) == group)] <- "Compvar"
if (!is.null(group_names)) {
sam_tab <- sample_data_df %>%
tibble::rownames_to_column("TempRowNames") %>%
dplyr::filter(Compvar %in% group_names) %>%
tibble::column_to_rownames("TempRowNames")
sam_tab$Compvar <- factor(sam_tab$Compvar, levels = group_names)
} else {
sam_tab <- sample_data_df
group_names <- as.character(unique(sam_tab$Compvar))
sam_tab$Compvar <- factor(sam_tab$Compvar, levels = group_names)
}
# check the levels of group are 2
if (length(unique(sam_tab$Compvar)) != 2 ) {
stop(
group, " contains more than 2 levels in `ps`, please input the names of comparison",
call. = FALSE
)
}
count_tab <- counts[, match(rownames(sam_tab), colnames(counts))]
# remove gene with constant in groups
## Judge whether the data are not 0 per groups
mdat <- dplyr::inner_join(sam_tab %>%
tibble::rownames_to_column("TempRowNames") %>%
dplyr::select(dplyr::all_of(c("TempRowNames", "Compvar"))),
count_tab %>%
t() %>% data.frame() %>%
tibble::rownames_to_column("TempRowNames"),
by = "TempRowNames") %>%
tibble::column_to_rownames("TempRowNames")
occ_fun <- function(x){
length(x[c(which(!is.na(x) & x != 0))])/length(x)
}
mdat_zero_occ <- mdat %>% dplyr::group_by(Compvar) %>%
dplyr::summarise(across(everything(), ~ occ_fun(.))) %>%
tibble::column_to_rownames("Compvar") %>%
t() %>% data.frame()
# change & (and) into | (or): 7/19/2022
mdat_zero_occ$KEEP <- ifelse(mdat_zero_occ[, 1] > 0.1 | mdat_zero_occ[, 2] > 0.1, TRUE, FALSE)
nfeature <- rownames(mdat_zero_occ)[mdat_zero_occ$KEEP]
mdat_remain <- mdat[, colnames(mdat) %in% c("Compvar", nfeature)]
## Judge whether the data are identical (such as all equal=1)
idential_res <- apply(mdat_remain[, -1], 2, function(x) {
length(unique(x)[unique(x) != 0]) != 1
}) %>%
data.frame()
nfeature_idential <- rownames(idential_res)[idential_res$.]
mdat_remain_v2 <- mdat_remain[, colnames(mdat_remain) %in% c("Compvar", nfeature_idential)]
## remove value appears to be constant with groups
## but the strict filtered criterion would remove too many taxa if the cutoff is 3
constant_res <- mdat_remain_v2 %>% dplyr::group_by(Compvar) %>%
dplyr::summarise(across(.cols = everything(), .fns = function(x){length(unique(x)) > 0})) %>%
tibble::column_to_rownames("Compvar") %>%
t() %>% data.frame()
nfeature_no_constant <- data.frame()
for (i in 1:nrow(constant_res)) {
nfeature_no_constant <- rbind(nfeature_no_constant,
data.frame(FeatureID=rownames(constant_res)[i],
KEEP=all(constant_res[i, 1], constant_res[i, 2])))
}
nfeature_no_constant_final <- rownames(idential_res)[nfeature_no_constant$KEEP]
mdat_remain_final <- mdat_remain[, colnames(mdat_remain) %in%
c("Compvar", nfeature_no_constant_final)]
## final tables
otu_tab_final <- mdat_remain_final %>%
dplyr::select(-Compvar) %>%
t() %>% data.frame()
sam_tab_final <- sam_tab[pmatch(colnames(otu_tab_final), rownames(sam_tab)), , F]
# return results
res <- list(count = otu_tab_final,
phen = sam_tab_final,
nf = NULL,
lib_size = NULL,
group_names = group_names)
return(res)
}
calculate_occurrence <- function(profile, metadata, groups) {
if (!all(rownames(metadata) == colnames(profile))) {
stop("Order of sampleID between colData and proData is wrong please check your inputdata")
}
res <- apply(profile, 1, function(x, y) {
dat <- data.frame(value = as.numeric(x), group = y)
occ <- tapply(dat$value, dat$group, function(x){
length(x[c(which(!is.na(x) & x != 0))]) / length(x)
}) %>%
data.frame() %>% stats::setNames("occ") %>%
tibble::rownames_to_column("Group")
occ1 <- occ[occ$Group %in% groups[1], "occ"]
occ2 <- occ[occ$Group %in% groups[2], "occ"]
occall <- length(dat$value[c(which(!is.na(dat$value) & dat$value != 0))]) / length(dat$value)
# 100%
occ1_percentage <- round(occ1, 4) * 100
occ2_percentage <- round(occ2, 4) * 100
occall_percentage <- round(occall, 4) * 100
res <- c(occall_percentage, occ1_percentage, occ2_percentage)
return(res)
}, metadata$Compvar) %>%
t() %>% data.frame() %>%
tibble::rownames_to_column("FeatureID")
colnames(res) <- c("FeatureID",
"Occurrence (100%)\n(All)",
paste0("Occurrence (100%)\n", groups))
return(res)
}
run_OddRatio <- function(datx, daty, GroupName) {
# glm result for odd ratios 95%CI
mdat <- dplyr::inner_join(datx %>% tibble::rownames_to_column("TempRowNames") %>%
dplyr::select(dplyr::all_of(c("TempRowNames", "Compvar"))),
daty %>% t() %>% data.frame() %>%
tibble::rownames_to_column("TempRowNames"),
by = "TempRowNames") %>%
tibble::column_to_rownames("TempRowNames")
# Judge whether the data are not 0 per groups
occ_fun <- function(x){
length(x[c(which(!is.na(x) & x != 0))]) / length(x)
}
mdat_zero_occ <- mdat %>%
dplyr::group_by(Compvar) %>%
dplyr::summarise(across(everything(), ~ occ_fun(.))) %>%
tibble::column_to_rownames("Compvar") %>%
t() %>% data.frame()
mdat_zero_occ$KEEP <- ifelse(mdat_zero_occ[, 1] > 0 & mdat_zero_occ[, 2] > 0, TRUE, FALSE)
nfeature <- rownames(mdat_zero_occ)[mdat_zero_occ$KEEP]
mdat_remain <- mdat[, colnames(mdat) %in% c("Compvar", nfeature)]
# Judge whether the data are identical (such as all equal=1)
idential_res <- apply(mdat_remain[, -1], 2, function(x) {
length(unique(x)[unique(x) != 0]) != 1
}) %>%
data.frame()
nfeature_idential <- rownames(idential_res)[idential_res$.]
mdat_remain_final <- mdat_remain[, colnames(mdat_remain) %in% c("Compvar", nfeature_idential)]
dat_phe <- mdat_remain_final %>%
dplyr::select(Compvar) %>%
dplyr::mutate(Compvar = ifelse(Compvar == GroupName[2], 1, 0))
dat_prf <- mdat_remain_final %>% dplyr::select(-Compvar)
glmFun <- function(GroupN, MarkerN) {
MarkerN[MarkerN == 0] <- min(MarkerN[MarkerN != 0])
dat_glm <- data.frame(group = GroupN,
marker = scale(MarkerN, center = TRUE, scale = TRUE)) %>%
na.omit()
model <- summary(stats::glm(group ~ marker, data = dat_glm,
family = binomial(link = "logit")))
res <- signif(exp(model$coefficients["marker", 1]) +
qnorm(c(0.025, 0.5, 0.975)) * model$coefficients["marker", 1], 2)
return(res)
}
glm_res <- t(apply(dat_prf, 2, function(x, group) {
res <- glmFun(group, as.numeric(x))
return(res)
}, dat_phe$Compvar))
Odd <- glm_res %>% data.frame() %>%
setNames(c("upper", "expected", "lower")) %>%
dplyr::mutate("Odds Ratio (95% CI)" = paste0(expected, " (", lower, ";", upper, ")"))
Odd$FeatureID <- rownames(glm_res)
res_ratain <- Odd[, c(5, 4)]
# drop features
drop_features <- dplyr::setdiff(rownames(daty), nfeature_idential)
if (length(drop_features) > 0) {
res_drop <- data.frame(FeatureID = drop_features,
Value = NA) %>%
stats::setNames(c("FeatureID", "Odds Ratio (95% CI)"))
res <- rbind(res_ratain, res_drop)
} else {
res <- res_ratain
}
return(res)
}
calculate_median_abundance <- function(profile, metadata, groups) {
if (!all(rownames(metadata) == colnames(profile))) {
stop("Order of sampleID between colData and proData is wrong please check your inputdata")
}
res <- apply(profile, 1, function(x, y) {
dat <- data.frame(value = as.numeric(x), group = y)
mn <- tapply(dat$value, dat$group, median) %>%
data.frame() %>% setNames("value") %>%
tibble::rownames_to_column("Group")
mn1 <- with(mn, mn[Group %in% groups[1], "value"])
mn2 <- with(mn, mn[Group %in% groups[2], "value"])
mnall <- median(dat$value)
if (all(mn1 != 0, mn2 != 0)) {
Log2median <- log2(mn1 / mn2)
} else {
Log2median <- NA
}
res <- c(Log2median, mnall, mn1, mn2)
return(res)
}, metadata$Compvar) %>%
t() %>% data.frame() %>%
tibble::rownames_to_column("FeatureID")
colnames(res) <- c("FeatureID",
paste0("Log2FoldChange (Median)\n", paste(groups, collapse = "_vs_")),
"Median Abundance\n(All)",
paste0("Median Abundance\n", groups))
return(res)
}
calculate_mean_abundance <- function(profile, metadata, groups) {
if (!all(rownames(metadata) == colnames(profile))) {
stop("Order of sampleID between colData and proData is wrong please check your inputdata")
}
res <- apply(profile, 1, function(x, y) {
dat <- data.frame(value = as.numeric(x), group = y)
mn <- tapply(dat$value, dat$group, mean) %>%
data.frame() %>% stats::setNames("value") %>%
tibble::rownames_to_column("Group")
mn1 <- with(mn, mn[Group %in% groups[1], "value"])
mn2 <- with(mn, mn[Group %in% groups[2], "value"])
mnall <- mean(dat$value)
if (all(mn1 != 0, mn2 != 0)) {
Log2mean <- log2(mn1 / mn2)
} else {
Log2mean <- NA
}
res <- c(Log2mean, mnall, mn1, mn2)
return(res)
}, metadata$Compvar) %>%
t() %>% data.frame() %>%
tibble::rownames_to_column("FeatureID")
colnames(res) <- c("FeatureID",
paste0("Log2FoldChange (Mean)\n", paste(groups, collapse = "_vs_")),
"Mean Abundance\n(All)",
paste0("Mean Abundance\n", groups))
return(res)
}
create_contrast <- function(groups, contrast = NULL) {
if (!is.factor(groups)) {
groups <- factor(groups)
}
lvl <- levels(groups)
n_lvl <- length(lvl)
if (n_lvl < 2) {
stop("Differential analysis requires at least two groups.")
}
if (n_lvl == 2) { # two groups
if (!is.null(contrast)) {
warning(
"`contrast` is ignored, you do not need to set it",
call. = FALSE
)
}
design <- rep(0, n_lvl)
design[1] <- -1
design[2] <- 1
}
return(design)
}
上述函数可以存到本地utilities.R
脚本,可以通过source(“utilities.R”)加载函数。
11.3 导入数据
-
ExpressionSet
来自于 章节 9
11.4 差异分析函数
#| warning: false
#| message: false
run_limma_voom <- function(
object = NULL,
data_counts = NULL,
data_metadata = NULL,
group,
group_names = NULL,
voom_span = 0.5,
p_adjust = "BH",
qvalue_cutoff = 0.05) {
# extract tables
dat_tables <- determine_tables(
object = object,
data_counts = data_counts,
data_metadata = data_metadata,
group = group,
group_names = group_names,
p_adjust = p_adjust)
otu_tab <- dat_tables$count
sam_tab <- dat_tables$phen
nf <- dat_tables$nf
lib_size <- colSums(dat_tables$count)
group_names <- dat_tables$group_names
# calculate occurrence for group each taxa
occurrence_taxa <- calculate_occurrence(otu_tab, sam_tab, group_names)
# calculate median abundance per group for enriched directory
median_abundance <- calculate_median_abundance(otu_tab, sam_tab, group_names)
# calculate mean abundance per group
mean_abundance <- calculate_mean_abundance(otu_tab, sam_tab, group_names)
# 95% CI Odd Ratio
odd_Ratio <- run_OddRatio(sam_tab, otu_tab, group_names)
# building contrast object
lvl <- levels(sam_tab$Compvar)
n_lvl <- length(lvl)
groups <- sam_tab$Compvar
contrast <- create_contrast(groups, group_names)
# design matrix
design <- model.matrix(~ 0 + groups)
# DA analysis:
res_temp <- data.frame(
FeatureID = rownames(test_df),
Enrichment = enrich_group,
EffectSize = ef,
logFC = test_df$logFC,
Pvalue = test_df$P.Value,
AdjustedPvalue = test_df$adj.P.Val) %>%
dplyr::inner_join(median_abundance, by = "FeatureID") %>%
dplyr::inner_join(mean_abundance, by = "FeatureID") %>%
dplyr::inner_join(occurrence_taxa, by = "FeatureID")
res_temp$Enrichment <- ifelse(res_temp$AdjustedPvalue < qvalue_cutoff,
res_temp$Enrichment, "Nonsignif")
# Number of Group
dat_status <- table(sam_tab$Compvar)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
res_temp$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
"vs",
paste(dat_status_number[2], dat_status_name[2], sep = "_"))
if (nrow(odd_Ratio) != 0) {
res_final <- res_temp[, c(1, 18, 2:17)] %>%
dplyr::inner_join(odd_Ratio, by = "FeatureID")
} else {
res_final <- res_temp[, c(1, 18, 2:17)]
}
return(res_final)
}
11.5 运行差异分析
- 设置对应参数,运行
run_limma_voom
if (!dir.exists("./data/result/DA/")) {
dir.create("./data/result/DA/", recursive = TRUE)
}
if (file.exists("./data/result/DA/HCC_Early_vs_Late_limma.csv")) {
Early_vs_Late <- readr::read_csv("./data/result/DA/HCC_Early_vs_Late_limma.csv")
} else {
Early_vs_Late <- run_limma_voom(
object = ExprSet,
group = "Group",
group_names = c("Early Stage", "Late Stage"))
write.csv(Early_vs_Late, "./data/result/DA/HCC_Early_vs_Late_limma.csv", row.names = F)
}
head(Early_vs_Late)
结果:输出结果包含了19列。它们分别是:
列名 | 中文名称 | 含义 |
---|---|---|
FeatureID | 特征名称 | 基因名 |
Block | 区块 | 分组检验数目 |
Enrichment | 富集方向 | 简单的富集方向 |
EffectSize | 组间效应值 | 两组均值差异比值 |
logFC | 组间log倍数 | limma的logFC |
Pvalue | 检验P值 | limma的pvalue |
AdjustedPvalue | 检验BH值 | limma的多重检验校正后的pvalue |
Log2FoldChange (Median) Early Stage_vs_Late Stage | 中位数的log2倍数 | |
Median Abundance (All) | 中位数丰度 | |
Median Abundance Early Stage | 中位数Early组丰度 | |
Median Abundance Late Stage | 中位数Late组丰度 | |
Log2FoldChange (Mean) Early Stage_vs_Late Stage | 组间log倍数 | |
Mean Abundance (All) | 平均值丰度 | |
Mean Abundance Early Stage | 平均值Early组丰度 | |
Mean Abundance Late Stage | 平均值Late组丰度 | |
Occurrence (100%) (All) | 出现率 | |
Occurrence (100%) Early Stage | Early组出现率 | |
Occurrence (100%) Late Stage | Late组出现率 | |
Odds Ratio (95% CI) | 比值比 | 95%置信区间比值比 |
- 对 小节 11.5 上述结果进行过滤,挑选显著差异的特征
qvalue < 0.05;
|log2foldchange| >= 0.5;
|log2foldchange| <= 50;
prevalence > 50
qval_cutoff <- 0.05
lgfc_cutoff <- 0.5
lgfc_max <- 50
prev_cutoff <- 50
Early_vs_Late_signif <- Early_vs_Late %>%
dplyr::filter(AdjustedPvalue < qval_cutoff) %>%
dplyr::filter(abs(logFC) >= lgfc_cutoff) %>%
dplyr::filter(abs(logFC) <= lgfc_max) %>%
dplyr::filter(`Occurrence (100%)\nEarly Stage` > prev_cutoff) %>%
dplyr::filter(`Occurrence (100%)\nLate Stage` > prev_cutoff)
# dplyr::filter(`Occurrence..100...Early.Stage` > prev_cutoff) %>%
# dplyr::filter(`Occurrence..100...Late.Stage` > prev_cutoff)
table(Early_vs_Late_signif$Enrichment)
结果:分别获得富集在Early和Late分组的差异基因,这些差异基因将用于后续特征选择。
11.6 输出结果
11.7 总结
在基因标记的探索和识别过程中,确保选取合适的差异分析方法和评估差异基因的指标是至关重要的步骤。为了有效识别出具有统计学显著性的基因差异,本节研究采用了广泛认可和应用的limma
差异分析包。
系统信息
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices datasets utils methods base
other attached packages:
[1] limma_3.58.1 Biobase_2.62.0 BiocGenerics_0.48.1
[4] data.table_1.15.4 lubridate_1.9.3 forcats_1.0.0
[7] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
[10] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[13] ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] gtable_0.3.5 jsonlite_1.8.8 compiler_4.3.3
[4] BiocManager_1.30.23 renv_1.0.0 tidyselect_1.2.1
[7] scales_1.3.0 statmod_1.5.0 yaml_2.3.8
[10] fastmap_1.1.1 R6_2.5.1 generics_0.1.3
[13] knitr_1.46 htmlwidgets_1.6.4 munsell_0.5.1
[16] tzdb_0.4.0 pillar_1.9.0 rlang_1.1.3
[19] utf8_1.2.4 stringi_1.8.4 xfun_0.43
[22] timechange_0.3.0 cli_3.6.2 withr_3.0.0
[25] magrittr_2.0.3 digest_0.6.35 grid_4.3.3
[28] rstudioapi_0.16.0 hms_1.1.3 lifecycle_1.0.4
[31] vctrs_0.6.5 evaluate_0.23 glue_1.7.0
[34] fansi_1.0.6 colorspace_2.1-0 rmarkdown_2.26
[37] tools_4.3.3 pkgconfig_2.0.3 htmltools_0.5.8.1