11 差异分析之limma

基因表达差异分析是一种用来比较不同条件下基因表达水平的方法，其目的是发现在不同条件下表达水平有显著差异的基因。这种分析通常在生物学研究中用于识别与特定生物过程、疾病或环境因素相关的基因。

11.1 加载R包

使用rm(list = ls())来清空环境中的所有变量。

library(tidyverse)
library(data.table)
library(Biobase)
library(limma)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)

11.2 读取函数

determine_tables <- function(
    object = NULL,
    data_counts = NULL,
    data_metadata = NULL,
    group,
    group_names = NULL,
    p_adjust = c("none", "fdr", "bonferroni", "holm",
                 "hochberg", "hommel", "BH", "BY"),
    da_method = NULL) {

  # p_adjust
  p_adjust <- match.arg(p_adjust,
                        c("none", "fdr", "bonferroni", "holm",
                          "hochberg", "hommel", "BH", "BY")
  )
  
  # determine to use the ExpressionSet-class object
  if (all(!is.null(object), !is.null(data_counts), !is.null(data_metadata))) {
    message("Either ExpressionSet object or counts and metadata as inputdata")
    message("Use the ExpressionSet object as default in the following processes")
  }
  
  # whether to extract tables from ExpressionSet
  if (!is.null(object)) {
    counts <- Biobase::exprs(object) %>%
      data.frame()
    metadata <- Biobase::pData(object) %>%
      data.frame()
  } else if (!is.null(data_counts)) {
    counts <- data_counts %>% 
      data.frame()
    if (!is.null(data_metadata)) {
      metadata <- data_metadata %>%
        data.frame()
    } else {
      stop("Please load sample data")
    }
  } else {
    stop("Please load ExpressionSet and counts")
  }

  # check whether group is valid
  meta_nms <- names(metadata)
  if (!group %in% meta_nms) {
    stop(
      group, " are not contained in the `pData` of `ExpressionSet`",
      call. = FALSE
    )
  }
  
  # groups split
  if (!is.null(group_names)) {
    if (length(grep(":", group_names)) == 0) {
      group_names <- group_names
    } else {
      group_names <- unlist(strsplit(group_names, ":", fixed = TRUE))
    }
  }
  
  #  metadata table: row -> SampleID; column -> phenotypic index
  sample_data_df <- metadata %>%
    data.frame()
  colnames(sample_data_df)[which(colnames(sample_data_df) == group)] <- "Compvar"
  if (!is.null(group_names)) {
    sam_tab <- sample_data_df %>%
      tibble::rownames_to_column("TempRowNames") %>%
      dplyr::filter(Compvar %in% group_names) %>%
      tibble::column_to_rownames("TempRowNames")
    sam_tab$Compvar <- factor(sam_tab$Compvar, levels = group_names)
  } else {
    sam_tab <- sample_data_df
    group_names <- as.character(unique(sam_tab$Compvar))
    sam_tab$Compvar <- factor(sam_tab$Compvar, levels = group_names)
  }
  
  # check the levels of group are 2
  if (length(unique(sam_tab$Compvar)) != 2 ) {
    stop(
      group, " contains more than 2 levels in `ps`, please input the names of comparison",
      call. = FALSE
    )
  }
  count_tab <- counts[, match(rownames(sam_tab), colnames(counts))]
  
  # remove gene with constant in groups
  ## Judge whether the data are not 0 per groups
  mdat <- dplyr::inner_join(sam_tab %>% 
                              tibble::rownames_to_column("TempRowNames") %>%
                              dplyr::select(dplyr::all_of(c("TempRowNames", "Compvar"))),
                            count_tab %>% 
                              t() %>% data.frame() %>%
                              tibble::rownames_to_column("TempRowNames"),
                            by = "TempRowNames") %>%
    tibble::column_to_rownames("TempRowNames")
  occ_fun <- function(x){
    length(x[c(which(!is.na(x) & x != 0))])/length(x)
  }
  mdat_zero_occ <- mdat %>% dplyr::group_by(Compvar) %>%
    dplyr::summarise(across(everything(), ~ occ_fun(.))) %>%
    tibble::column_to_rownames("Compvar") %>%
    t() %>% data.frame()
  # change & (and) into | (or): 7/19/2022
  mdat_zero_occ$KEEP <- ifelse(mdat_zero_occ[, 1] > 0.1 | mdat_zero_occ[, 2] > 0.1, TRUE, FALSE)
  nfeature <- rownames(mdat_zero_occ)[mdat_zero_occ$KEEP]
  mdat_remain <- mdat[, colnames(mdat) %in% c("Compvar", nfeature)]
  
  ## Judge whether the data are identical (such as all equal=1)
  idential_res <- apply(mdat_remain[, -1], 2, function(x) {
    length(unique(x)[unique(x) != 0]) != 1
  }) %>%
    data.frame()
  nfeature_idential <- rownames(idential_res)[idential_res$.]
  mdat_remain_v2 <- mdat_remain[, colnames(mdat_remain) %in% c("Compvar", nfeature_idential)]
  
  ## remove value appears to be constant with groups
  ## but the strict filtered criterion would remove too many taxa if the cutoff is 3
  constant_res <- mdat_remain_v2 %>% dplyr::group_by(Compvar) %>%
    dplyr::summarise(across(.cols = everything(), .fns = function(x){length(unique(x)) > 0})) %>%
    tibble::column_to_rownames("Compvar") %>%
    t() %>% data.frame()
  
  nfeature_no_constant <- data.frame()
  for (i in 1:nrow(constant_res)) {
    nfeature_no_constant <- rbind(nfeature_no_constant,
                                  data.frame(FeatureID=rownames(constant_res)[i],
                                             KEEP=all(constant_res[i, 1], constant_res[i, 2])))
  }
  nfeature_no_constant_final <- rownames(idential_res)[nfeature_no_constant$KEEP]
  mdat_remain_final <- mdat_remain[, colnames(mdat_remain) %in% 
                                     c("Compvar", nfeature_no_constant_final)]
  
  ## final tables
  otu_tab_final <- mdat_remain_final %>%
    dplyr::select(-Compvar) %>%
    t() %>% data.frame()
  sam_tab_final <- sam_tab[pmatch(colnames(otu_tab_final), rownames(sam_tab)), , F]
  
  # return results
  res <- list(count = otu_tab_final,
              phen = sam_tab_final,
              nf = NULL,
              lib_size = NULL,
              group_names = group_names)
  
  return(res)
}

calculate_occurrence <- function(profile, metadata, groups) {
  
  if (!all(rownames(metadata) == colnames(profile))) {
    stop("Order of sampleID between colData and proData is wrong please check your inputdata")
  }
  
  res <- apply(profile, 1, function(x, y) {
    dat <- data.frame(value = as.numeric(x), group = y)
    occ <- tapply(dat$value, dat$group, function(x){
      length(x[c(which(!is.na(x) & x != 0))]) / length(x)
    }) %>%
      data.frame() %>% stats::setNames("occ") %>%
      tibble::rownames_to_column("Group")
    occ1 <- occ[occ$Group %in% groups[1], "occ"]
    occ2 <- occ[occ$Group %in% groups[2], "occ"]
    occall <- length(dat$value[c(which(!is.na(dat$value) & dat$value != 0))]) / length(dat$value)
    
    # 100%
    occ1_percentage <- round(occ1, 4) * 100
    occ2_percentage <- round(occ2, 4) * 100
    occall_percentage <- round(occall, 4) * 100
    
    res <- c(occall_percentage, occ1_percentage, occ2_percentage)
    return(res)
  }, metadata$Compvar) %>%
    t() %>% data.frame() %>%
    tibble::rownames_to_column("FeatureID")
  
  colnames(res) <- c("FeatureID",
                     "Occurrence (100%)\n(All)",
                     paste0("Occurrence (100%)\n", groups))
  
  return(res)
}

run_OddRatio <- function(datx, daty, GroupName) {
  
  # glm result for odd ratios 95%CI
  mdat <- dplyr::inner_join(datx %>% tibble::rownames_to_column("TempRowNames") %>%
                              dplyr::select(dplyr::all_of(c("TempRowNames", "Compvar"))),
                            daty %>% t() %>% data.frame() %>%
                              tibble::rownames_to_column("TempRowNames"),
                            by = "TempRowNames") %>%
    tibble::column_to_rownames("TempRowNames")
  
  # Judge whether the data are not 0 per groups
  occ_fun <- function(x){
    length(x[c(which(!is.na(x) & x != 0))]) / length(x)
  }
  mdat_zero_occ <- mdat %>%
    dplyr::group_by(Compvar) %>%
    dplyr::summarise(across(everything(), ~ occ_fun(.))) %>%
    tibble::column_to_rownames("Compvar") %>%
    t() %>% data.frame()
  mdat_zero_occ$KEEP <- ifelse(mdat_zero_occ[, 1] > 0 & mdat_zero_occ[, 2] > 0, TRUE, FALSE)
  nfeature <- rownames(mdat_zero_occ)[mdat_zero_occ$KEEP]
  mdat_remain <- mdat[, colnames(mdat) %in% c("Compvar", nfeature)]
  
  # Judge whether the data are identical (such as all equal=1)
  idential_res <- apply(mdat_remain[, -1], 2, function(x) {
    length(unique(x)[unique(x) != 0]) != 1
  }) %>%
    data.frame()
  nfeature_idential <- rownames(idential_res)[idential_res$.]
  mdat_remain_final <- mdat_remain[, colnames(mdat_remain) %in% c("Compvar", nfeature_idential)]
  
  dat_phe <- mdat_remain_final %>%
    dplyr::select(Compvar) %>%
    dplyr::mutate(Compvar = ifelse(Compvar == GroupName[2], 1, 0))
  dat_prf <- mdat_remain_final %>% dplyr::select(-Compvar)
  
  glmFun <- function(GroupN, MarkerN) {
    
    MarkerN[MarkerN == 0] <- min(MarkerN[MarkerN != 0])
    dat_glm <- data.frame(group = GroupN,
                          marker = scale(MarkerN, center = TRUE, scale = TRUE)) %>%
      na.omit()
    model <- summary(stats::glm(group ~ marker, data = dat_glm,
                                family = binomial(link = "logit")))
    res <- signif(exp(model$coefficients["marker", 1]) +
                    qnorm(c(0.025, 0.5, 0.975)) * model$coefficients["marker", 1], 2)
    
    return(res)
  }
  
  glm_res <- t(apply(dat_prf, 2, function(x, group) {
    res <- glmFun(group, as.numeric(x))
    return(res)
  }, dat_phe$Compvar))
  
  Odd <- glm_res %>% data.frame() %>%
    setNames(c("upper", "expected", "lower")) %>%
    dplyr::mutate("Odds Ratio (95% CI)" = paste0(expected, " (", lower, ";", upper, ")"))
  Odd$FeatureID <- rownames(glm_res)
  
  res_ratain <- Odd[, c(5, 4)]
  
  # drop features
  drop_features <- dplyr::setdiff(rownames(daty), nfeature_idential)
  if (length(drop_features) > 0) {
    res_drop <- data.frame(FeatureID = drop_features,
                           Value = NA) %>%
      stats::setNames(c("FeatureID", "Odds Ratio (95% CI)"))
    
    res <- rbind(res_ratain, res_drop)
  } else {
    res <- res_ratain
  }
  
  return(res)
}

calculate_median_abundance <- function(profile, metadata, groups) {
  
  if (!all(rownames(metadata) == colnames(profile))) {
    stop("Order of sampleID between colData and proData is wrong please check your inputdata")
  }
  
  res <- apply(profile, 1, function(x, y) {
    dat <- data.frame(value = as.numeric(x), group = y)
    mn <- tapply(dat$value, dat$group, median) %>%
      data.frame() %>% setNames("value") %>%
      tibble::rownames_to_column("Group")
    mn1 <- with(mn, mn[Group %in% groups[1], "value"])
    mn2 <- with(mn, mn[Group %in% groups[2], "value"])
    mnall <- median(dat$value)
    
    if (all(mn1 != 0, mn2 != 0)) {
      Log2median <- log2(mn1 / mn2)
    } else {
      Log2median <- NA
    }
    
    res <- c(Log2median, mnall, mn1, mn2)
    return(res)
  }, metadata$Compvar) %>%
    t() %>% data.frame() %>%
    tibble::rownames_to_column("FeatureID")
  
  colnames(res) <- c("FeatureID",
                     paste0("Log2FoldChange (Median)\n", paste(groups, collapse = "_vs_")),
                     "Median Abundance\n(All)",
                     paste0("Median Abundance\n", groups))
  
  return(res)
}

calculate_mean_abundance <- function(profile, metadata, groups) {
  
  if (!all(rownames(metadata) == colnames(profile))) {
    stop("Order of sampleID between colData and proData is wrong please check your inputdata")
  }
  
  res <- apply(profile, 1, function(x, y) {
    dat <- data.frame(value = as.numeric(x), group = y)
    mn <- tapply(dat$value, dat$group, mean) %>%
      data.frame() %>% stats::setNames("value") %>%
      tibble::rownames_to_column("Group")
    mn1 <- with(mn, mn[Group %in% groups[1], "value"])
    mn2 <- with(mn, mn[Group %in% groups[2], "value"])
    mnall <- mean(dat$value)
    
    if (all(mn1 != 0, mn2 != 0)) {
      Log2mean <- log2(mn1 / mn2)
    } else {
      Log2mean <- NA
    }
    
    res <- c(Log2mean, mnall, mn1, mn2)
    return(res)
  }, metadata$Compvar) %>%
    t() %>% data.frame() %>%
    tibble::rownames_to_column("FeatureID")
  
  colnames(res) <- c("FeatureID",
                     paste0("Log2FoldChange (Mean)\n", paste(groups, collapse = "_vs_")),
                     "Mean Abundance\n(All)",
                     paste0("Mean Abundance\n", groups))
  
  return(res)
}

create_contrast <- function(groups, contrast = NULL) {
  if (!is.factor(groups)) {
    groups <- factor(groups)
  }
  lvl <- levels(groups)
  n_lvl <- length(lvl)
  if (n_lvl < 2) {
    stop("Differential analysis requires at least two groups.")
  }
  
  if (n_lvl == 2) { # two groups
    if (!is.null(contrast)) {
      warning(
        "`contrast` is ignored, you do not need to set it",
        call. = FALSE
      )
    }
    design <- rep(0, n_lvl)
    design[1] <- -1
    design[2] <- 1
  }
  
  return(design)
}

上述函数可以存到本地utilities.R脚本，可以通过source(“utilities.R”)加载函数。

11.3 导入数据

ExpressionSet来自于章节 9


ExprSet <- readRDS("./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS")

11.4 差异分析函数

#| warning: false
#| message: false

run_limma_voom <- function(
    object = NULL,
    data_counts = NULL,
    data_metadata = NULL,
    group,
    group_names = NULL,
    voom_span = 0.5,
    p_adjust = "BH",
    qvalue_cutoff = 0.05) {

  # extract tables
  dat_tables <- determine_tables(
    object = object,
    data_counts = data_counts,
    data_metadata = data_metadata,
    group = group,
    group_names = group_names,
    p_adjust = p_adjust)

  otu_tab <- dat_tables$count
  sam_tab <- dat_tables$phen
  nf <- dat_tables$nf
  lib_size <- colSums(dat_tables$count)
  group_names <- dat_tables$group_names

  # calculate occurrence for group each taxa
  occurrence_taxa <- calculate_occurrence(otu_tab, sam_tab, group_names)
  # calculate median abundance per group for enriched directory
  median_abundance <- calculate_median_abundance(otu_tab, sam_tab, group_names)
  # calculate mean abundance per group
  mean_abundance <- calculate_mean_abundance(otu_tab, sam_tab, group_names)
  # 95% CI Odd Ratio
  odd_Ratio <- run_OddRatio(sam_tab, otu_tab, group_names)

  # building contrast object
  lvl <- levels(sam_tab$Compvar)
  n_lvl <- length(lvl)
  groups <- sam_tab$Compvar
  contrast <- create_contrast(groups, group_names)

  # design matrix
  design <- model.matrix(~ 0 + groups)

  # DA analysis: 
  res_temp <- data.frame(
      FeatureID = rownames(test_df),
      Enrichment = enrich_group,
      EffectSize = ef,
      logFC = test_df$logFC,
      Pvalue = test_df$P.Value,
      AdjustedPvalue = test_df$adj.P.Val) %>%
    dplyr::inner_join(median_abundance, by = "FeatureID") %>%
    dplyr::inner_join(mean_abundance, by = "FeatureID") %>%
    dplyr::inner_join(occurrence_taxa, by = "FeatureID")

  res_temp$Enrichment <- ifelse(res_temp$AdjustedPvalue < qvalue_cutoff, 
                                res_temp$Enrichment, "Nonsignif")

  # Number of Group
  dat_status <- table(sam_tab$Compvar)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  res_temp$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
                       "vs",
                       paste(dat_status_number[2], dat_status_name[2], sep = "_"))

  if (nrow(odd_Ratio) != 0) {
    res_final <- res_temp[, c(1, 18, 2:17)] %>%
      dplyr::inner_join(odd_Ratio, by = "FeatureID")
  } else {
    res_final <- res_temp[, c(1, 18, 2:17)]
  }

  return(res_final)
}

11.5 运行差异分析

设置对应参数，运行run_limma_voom


if (!dir.exists("./data/result/DA/")) {
  dir.create("./data/result/DA/", recursive = TRUE)
}

if (file.exists("./data/result/DA/HCC_Early_vs_Late_limma.csv")) {
  Early_vs_Late <- readr::read_csv("./data/result/DA/HCC_Early_vs_Late_limma.csv")
} else {
  Early_vs_Late <- run_limma_voom(
    object = ExprSet,
    group = "Group",
    group_names = c("Early Stage", "Late Stage"))  
  write.csv(Early_vs_Late, "./data/result/DA/HCC_Early_vs_Late_limma.csv", row.names = F)  
}

head(Early_vs_Late)

结果：输出结果包含了19列。它们分别是：

表 11.1: 差异分析limma的结果说明

列名	中文名称	含义
FeatureID	特征名称	基因名
Block	区块	分组检验数目
Enrichment	富集方向	简单的富集方向
EffectSize	组间效应值	两组均值差异比值
logFC	组间log倍数	limma的logFC
Pvalue	检验P值	limma的pvalue
AdjustedPvalue	检验BH值	limma的多重检验校正后的pvalue
Log2FoldChange (Median) Early Stage_vs_Late Stage	中位数的log2倍数
Median Abundance (All)	中位数丰度
Median Abundance Early Stage	中位数Early组丰度
Median Abundance Late Stage	中位数Late组丰度
Log2FoldChange (Mean) Early Stage_vs_Late Stage	组间log倍数
Mean Abundance (All)	平均值丰度
Mean Abundance Early Stage	平均值Early组丰度
Mean Abundance Late Stage	平均值Late组丰度
Occurrence (100%) (All)	出现率
Occurrence (100%) Early Stage	Early组出现率
Occurrence (100%) Late Stage	Late组出现率
Odds Ratio (95% CI)	比值比	95%置信区间比值比

对小节 11.5 上述结果进行过滤，挑选显著差异的特征

qvalue < 0.05;
|log2foldchange| >= 0.5;
|log2foldchange| <= 50;
prevalence > 50


qval_cutoff <- 0.05
lgfc_cutoff <- 0.5
lgfc_max <- 50
prev_cutoff <- 50

Early_vs_Late_signif <- Early_vs_Late %>%
  dplyr::filter(AdjustedPvalue < qval_cutoff) %>%
  dplyr::filter(abs(logFC) >= lgfc_cutoff) %>%
  dplyr::filter(abs(logFC) <= lgfc_max) %>%
  dplyr::filter(`Occurrence (100%)\nEarly Stage` > prev_cutoff) %>%
  dplyr::filter(`Occurrence (100%)\nLate Stage` > prev_cutoff)
  # dplyr::filter(`Occurrence..100...Early.Stage` > prev_cutoff) %>%
  # dplyr::filter(`Occurrence..100...Late.Stage` > prev_cutoff)  

table(Early_vs_Late_signif$Enrichment)

结果：分别获得富集在Early和Late分组的差异基因，这些差异基因将用于后续特征选择。

11.6 输出结果


if (!dir.exists("./data/result/DA/")) {
  dir.create("./data/result/DA/", recursive = TRUE)
}

write.csv(Early_vs_Late_signif, "./data/result/DA/HCC_Early_vs_Late_limma_select.csv", row.names = F)

11.7 总结

在基因标记的探索和识别过程中，确保选取合适的差异分析方法和评估差异基因的指标是至关重要的步骤。为了有效识别出具有统计学显著性的基因差异，本节研究采用了广泛认可和应用的limma差异分析包。

系统信息

sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] limma_3.58.1        Biobase_2.62.0      BiocGenerics_0.48.1
 [4] data.table_1.15.4   lubridate_1.9.3     forcats_1.0.0      
 [7] stringr_1.5.1       dplyr_1.1.4         purrr_1.0.2        
[10] readr_2.1.5         tidyr_1.3.1         tibble_3.2.1       
[13] ggplot2_3.5.1       tidyverse_2.0.0    

loaded via a namespace (and not attached):
 [1] gtable_0.3.5        jsonlite_1.8.8      compiler_4.3.3     
 [4] BiocManager_1.30.23 renv_1.0.0          tidyselect_1.2.1   
 [7] scales_1.3.0        statmod_1.5.0       yaml_2.3.8         
[10] fastmap_1.1.1       R6_2.5.1            generics_0.1.3     
[13] knitr_1.46          htmlwidgets_1.6.4   munsell_0.5.1      
[16] tzdb_0.4.0          pillar_1.9.0        rlang_1.1.3        
[19] utf8_1.2.4          stringi_1.8.4       xfun_0.43          
[22] timechange_0.3.0    cli_3.6.2           withr_3.0.0        
[25] magrittr_2.0.3      digest_0.6.35       grid_4.3.3         
[28] rstudioapi_0.16.0   hms_1.1.3           lifecycle_1.0.4    
[31] vctrs_0.6.5         evaluate_0.23       glue_1.7.0         
[34] fansi_1.0.6         colorspace_2.1-0    rmarkdown_2.26     
[37] tools_4.3.3         pkgconfig_2.0.3     htmltools_0.5.8.1