14  GO富集分析

GO(Gene Ontology)是一个在生物信息学中广泛使用的概念,用于描述基因和基因产物的功能、它们所处的细胞位置以及它们参与的生物过程。GO项目是一个协作性的国际努力,旨在建立和维护一个适用于各种物种的、结构化的、可控制的词汇表(本体论),用以描述生物学的各个方面。具体来说,GO分为三个主要方面:

  1. 分子功能(Molecular Function, MF):描述基因产物在分子水平上的活动,如催化活性或结合能力。

  2. 生物过程(Biological Process, BP):描述生物学上发生的一系列事件,如细胞生长、代谢过程或信号传导。

  3. 细胞组件(Cellular Component, CC):描述基因产物在细胞内的位置,如细胞核、线粒体或细胞膜。

14.1 加载R包

加载分析所需的R包,并使用rm(list = ls())清空当前环境中的所有变量(该命令只删除变量,不会卸载已加载的R包)。

library(tidyverse)
library(clusterProfiler)
library(org.Hs.eg.db)

# Start from an empty workspace. This removes objects only; attached packages
# and previously set options are not affected.
rm(list = ls())

# Keep strings as character vectors when data frames are built.
# Spelled `FALSE` because `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)

# Size limit for globals exported by the future framework:
# 10000 * 1024^2 bytes (10000 MiB).
options(future.globals.maxSize = 10000 * 1024^2)

# Labels, colours and point-shape codes for the two groups, in matching order.
grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)

14.2 导入数据


# Differential-analysis table (limma, Early vs Late) read from CSV.
da_res <- read.csv(
  file = "./data/result/DA/HCC_Early_vs_Late_limma.csv"
)

# Merged expression-set object stored as an RDS file.
ExprSet <- readRDS(
  file = "./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS"
)

14.3 所需函数

  • get_DEGs获得不同类型的差异基因列表;

  • get_ORA基于Over-Representation Analysis (ORA,过表达分析)方法进行功能富集分析;

  • 使用clusterProfiler::enrichGO(Yu 等 2012)做GO分析。


#' Split a differential-analysis table into significant gene lists
#'
#' Labels every row of `dat` with the group it is enriched in (or
#' "Nonsignif") and returns the significant rows, overall and per direction.
#'
#' @param dat A data frame of differential-analysis results. It must contain
#'   a `Block` column whose first entry looks like "<group1> vs <group2>"
#'   (a numeric prefix such as "1_" is stripped from each name). Columns 5, 6
#'   and 7 are renamed to `lg2fc`, `pval` and `qval` and used as the log2 fold
#'   change, the p-value and the adjusted p-value.
#' @param lg2fc_cutoff Threshold on the absolute log2 fold change (strict `>`).
#' @param pval_cutoff Threshold on the p-value (strict `<`).
#' @param qval_cutoff Threshold on the adjusted p-value (strict `<`).
#'
#' @return A named list of three data frames, each carrying the added factor
#'   column `EnrichedDir`:
#'   `all` (every significant row), `up` (rows enriched in the first group,
#'   plus `Status = "Up_regulated"`) and `down` (rows enriched in the second
#'   group, plus `Status = "Down_regulated"`). Any of them may have zero rows.
get_DEGs <- function(
    dat,
    lg2fc_cutoff = 0.5,
    pval_cutoff = 0.05,
    qval_cutoff = 0.05) {

  if (!is.data.frame(dat) || ncol(dat) < 7 || !("Block" %in% colnames(dat))) {
    stop(
      "`dat` must be a data frame with a `Block` column and at least 7 columns.",
      call. = FALSE
    )
  }

  # Group labels come from the first contrast string, e.g. "1_A vs 2_B".
  # as.character() keeps this working when `Block` was read in as a factor.
  group_names <- gsub(
    "\\d+_", "",
    unlist(strsplit(as.character(dat$Block[1]), " vs "))
  )
  if (length(group_names) < 2 || anyNA(group_names[1:2])) {
    stop(
      "`dat$Block[1]` must look like \"<group1> vs <group2>\".",
      call. = FALSE
    )
  }

  # The statistics are addressed by position, then given fixed names.
  colnames(dat)[5:7] <- c("lg2fc", "pval", "qval")

  # Direction of enrichment: a positive fold change above the cutoff is
  # assigned to the second group, a negative one below -cutoff to the first.
  # Rows with missing statistics that match no rule stay NA.
  passes_pq <- dat$pval < pval_cutoff & dat$qval < qval_cutoff
  direction <- rep(NA_character_, nrow(dat))
  direction[which(passes_pq & dat$lg2fc > lg2fc_cutoff)] <- group_names[2]
  direction[which(passes_pq & dat$lg2fc < -lg2fc_cutoff)] <- group_names[1]
  direction[which(abs(dat$lg2fc) <= lg2fc_cutoff |
                    dat$pval >= pval_cutoff |
                    dat$qval >= qval_cutoff)] <- "Nonsignif"
  dat$EnrichedDir <- factor(
    direction,
    levels = c(group_names[2], "Nonsignif", group_names[1])
  )

  # Significant rows only. This object was referenced but never created in
  # the previous version of the function, which made every call fail.
  dat_signif <- dat[which(dat$EnrichedDir != "Nonsignif"), , drop = FALSE]

  # Rows enriched in `group`, tagged with `status`; safe for zero matches.
  tag_direction <- function(group, status) {
    hits <- dat_signif[which(dat_signif$EnrichedDir == group), , drop = FALSE]
    hits$Status <- rep(status, nrow(hits))
    hits
  }

  list(
    all = dat_signif,
    up = tag_direction(group_names[1], "Up_regulated"),     # 1st group
    down = tag_direction(group_names[2], "Down_regulated")  # 2nd group
  )
}

#' Over-representation analysis wrapper -- body incomplete in this listing
#'
#' NOTE(review): the visible body only assembles the return value. `fit`,
#' `fit_result` and `pl` are never assigned inside this function and none of
#' the arguments is used, so the code that should compute the enrichment and
#' build the plot is missing here. As written, the three names are looked up
#' in the enclosing environment; the call errors if they do not exist there.
#' Restore the missing computation before relying on this function.
#'
#' @param genelist Input passed by the caller (a data frame of genes in the
#'   visible call site). Not used in the visible body.
#' @param genetype One of "all", "up", "down"; the default is the full choice
#'   vector (presumably meant for `match.arg()` -- TODO confirm).
#' @param ORAtype One of "GO", "KEGG"; the default is the full choice vector
#'   (presumably meant for `match.arg()` -- TODO confirm).
#' @param showcase Defaults to 10 (presumably the number of terms to show --
#'   TODO confirm).
#' @param group_names Defaults to the global `grp_names`.
#'
#' @return A list with elements `fit`, `result` and `pl`.
get_ORA <- function(
    genelist,
    genetype = c("all", "up", "down"),
    ORAtype = c("GO", "KEGG"),
    showcase = 10,
    group_names = grp_names) {
  
  # NOTE(review): `fit`, `fit_result` and `pl` are not defined in this body.
  res <- list(fit = fit,
              result = fit_result,
              pl = pl)

  return(res)
}

14.4 运行

  • 设置对应参数,运行get_DEGs

# Classify the differential-analysis results with the default cut-offs.
df_DEGs <- get_DEGs(da_res)

# Show the names of the returned list elements.
names(df_DEGs)
  • 使用ORA方法计算所有差异基因的富集GO term结果

# GO over-representation analysis on the full list of significant genes.
All_ORA_GO <- get_ORA(
  genelist = df_DEGs[["all"]],
  genetype = "all",
  ORAtype = "GO",
  showcase = 10
)

# Display the plot stored in the result.
All_ORA_GO[["pl"]]

结果:该图片展示了基于基因本体论(Gene Ontology, GO)的生物信息学分析结果,主要关注生物过程、分子功能和细胞组件的多个方面。分析结果显示,在肝细胞癌(HCC)相关的基因中,有几个关键的生物过程和分子功能显著富集。

14.5 输出结果


# Make sure the figure directory exists before writing into it.
if (!dir.exists("./data/result/Figure")) {
  dir.create("./data/result/Figure", recursive = TRUE)
}

# Write the GO enrichment plot to a PDF file.
ggsave(
  filename = "./data/result/Figure/Fig3-A.pdf",
  plot = All_ORA_GO[["pl"]],
  width = 8,
  height = 6,
  dpi = 600
)

14.6 总结

通过进行GO(Gene Ontology)富集分析,能够深入理解差异基因在细胞内的功能角色,这一分析为后续关于肝细胞癌(HCC)的研究奠定了坚实的生物学基础。

系统信息
# Record the R version and package versions used for this analysis.
utils::sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] org.Hs.eg.db_3.18.0    AnnotationDbi_1.66.0   IRanges_2.36.0        
 [4] S4Vectors_0.40.2       Biobase_2.62.0         BiocGenerics_0.48.1   
 [7] clusterProfiler_4.10.1 lubridate_1.9.3        forcats_1.0.0         
[10] stringr_1.5.1          dplyr_1.1.4            purrr_1.0.2           
[13] readr_2.1.5            tidyr_1.3.1            tibble_3.2.1          
[16] ggplot2_3.5.1          tidyverse_2.0.0       

loaded via a namespace (and not attached):
  [1] DBI_1.2.2               bitops_1.0-7            gson_0.1.0             
  [4] shadowtext_0.1.3        gridExtra_2.3           rlang_1.1.3            
  [7] magrittr_2.0.3          DOSE_3.28.2             compiler_4.3.3         
 [10] RSQLite_2.3.6           png_0.1-8               vctrs_0.6.5            
 [13] reshape2_1.4.4          pkgconfig_2.0.3         crayon_1.5.2           
 [16] fastmap_1.1.1           XVector_0.42.0          ggraph_2.2.1           
 [19] utf8_1.2.4              HDO.db_0.99.1           rmarkdown_2.26         
 [22] tzdb_0.4.0              enrichplot_1.22.0       bit_4.0.5              
 [25] xfun_0.43               zlibbioc_1.48.2         cachem_1.0.8           
 [28] aplot_0.2.2             GenomeInfoDb_1.38.8     jsonlite_1.8.8         
 [31] blob_1.2.4              BiocParallel_1.36.0     tweenr_2.0.3           
 [34] parallel_4.3.3          R6_2.5.1                RColorBrewer_1.1-3     
 [37] stringi_1.8.4           GOSemSim_2.28.1         Rcpp_1.0.12            
 [40] knitr_1.46              Matrix_1.6-5            splines_4.3.3          
 [43] igraph_2.0.3            timechange_0.3.0        tidyselect_1.2.1       
 [46] viridis_0.6.5           qvalue_2.34.0           rstudioapi_0.16.0      
 [49] yaml_2.3.8              codetools_0.2-19        lattice_0.22-6         
 [52] plyr_1.8.9              treeio_1.26.0           withr_3.0.0            
 [55] KEGGREST_1.42.0         evaluate_0.23           gridGraphics_0.5-1     
 [58] scatterpie_0.2.2        polyclip_1.10-6         Biostrings_2.70.3      
 [61] ggtree_3.10.1           pillar_1.9.0            BiocManager_1.30.23    
 [64] renv_1.0.0              ggfun_0.1.4             generics_0.1.3         
 [67] RCurl_1.98-1.14         hms_1.1.3               tidytree_0.4.6         
 [70] munsell_0.5.1           scales_1.3.0            glue_1.7.0             
 [73] lazyeval_0.2.2          tools_4.3.3             data.table_1.15.4      
 [76] fgsea_1.28.0            fs_1.6.4                graphlayouts_1.1.1     
 [79] fastmatch_1.1-4         tidygraph_1.3.1         cowplot_1.1.3          
 [82] grid_4.3.3              ape_5.8                 colorspace_2.1-0       
 [85] nlme_3.1-164            patchwork_1.2.0         GenomeInfoDbData_1.2.11
 [88] ggforce_0.4.2           cli_3.6.2               fansi_1.0.6            
 [91] viridisLite_0.4.2       gtable_0.3.5            yulab.utils_0.1.4      
 [94] digest_0.6.35           ggplotify_0.1.2         ggrepel_0.9.5          
 [97] htmlwidgets_1.6.4       farver_2.1.1            memoise_2.0.1          
[100] htmltools_0.5.8.1       lifecycle_1.0.4         httr_1.4.7             
[103] GO.db_3.19.1            bit64_4.0.5             MASS_7.3-60.0.1