15 KEGG通路富集分析

KEGG pathway是京都基因与基因组百科全书（Kyoto Encyclopedia of Genes and Genomes，简称KEGG）中的一个子数据库，它包括了代谢、调控、通路、生化、疾病、药物等相关的分子相互作用和关系网络。

KEGG通路富集分析是一种用于分析高通量实验数据的方法，其目标是确定在实验中观察到的基因或其他实体是否集中在特定的KEGG通路中，以便推断这些通路在实验条件下是否显著富集。在生物学研究中，通过通路富集分析可以了解一组基因是否在某些已知的代谢通路、信号通路或细胞过程中显著富集，从而揭示这些基因在生物体内可能的功能和相互作用。这种分析方法通常利用KEGG等数据库提供的已知生物通路信息，结合超几何分布等方法计算给定基因集在某个通路上的P值，以判断该基因集在该通路上的富集程度是否显著。

15.1 加载R包

使用rm(list = ls())来清空环境中的所有变量。

library(tidyverse)
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(massdatabase)
library(enrichplot)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)

15.2 导入数据

差异结果来自于章节 11；
ExpressionSet来自于章节 9。


da_res <- read.csv("./data/result/DA/HCC_Early_vs_Late_limma.csv")

ExprSet <- readRDS("./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS")

15.3 所需函数

get_KEGG_category获得KEGG pathway的不同层级关系表；
get_GSEA基于GSEA方法功能富集分析；
使用clusterProfiler::GSEA(Yu 等 2012)做GSEA分析；
使用msigdbr::msigdbr(Dolgalev 2020)拿到基因和通路关系对应表。
下载KEGG关系表的函数，它们均来自于massdatabase包(Shen 等 2022)。


get_KEGG_category <- function(
    pathdir,
    sleeptime = 2,
    organ = "hsa") {

  
  KEGG_pathway_database <- massdatabase::convert_kegg2metpath(
    data = KEGG_data, 
    path = pathdir)    
  
  res <- KEGG_relation %>%
    dplyr::group_by(pathID) %>%
    dplyr::mutate(category = unlist(strsplit(pathClass, ";\\s+"))[1],
                  subcategory = unlist(strsplit(pathClass, ";\\s+"))[2]) %>%
    dplyr::ungroup() %>%
    dplyr::distinct() %>%
    dplyr::select(pathID, category, subcategory, pathName, Describtion)
  
  return(res)
}

get_GSEA <- function(
    genelist,
    showcase = 10,
    species_type = c("Mus musculus", "Homo sapiens"),
    pathway_type = c("C2", "CP:KEGG"),
    group_names = grp_names) {
  
  # KEGG table
  if (!dir.exists("./data/result/Function")) {
    dir.create("./data/result/Function", recursive = TRUE)
  }
  hsa_kegg_tab <- get_KEGG_category(
    pathdir = "./data/result/Function/KEGG/hsa_KEGG",
    sleeptime = 2,
    organ = "hsa") %>%
    dplyr::filter(!subcategory %in% 
       c("Drug resistance: antineoplastic", "Cardiovascular disease",
         "Infectious disease: parasitic", "Neurodegenerative disease",
         "Aging", "Cellular community - eukaryotes", "Infectious disease: bacterial",
         "Cancer: specific types"))
  
  genelist_new <- genelist %>%
    dplyr::arrange(desc(logFC))
  
  inputgenes <- genelist_new$logFC
  names(inputgenes) <- genelist_new$FeatureID
  
  # download background genes
  kegg_df_cln <- kegg_df %>%
    dplyr::inner_join(hsa_kegg_tab %>%
                        dplyr::select(category, subcategory, pathName),
                      by = c("gs_description" = "pathName"))
  
  # KEGG pathway (geneID)
  fit <- clusterProfiler::GSEA(
    geneList = inputgenes,
    pvalueCutoff = 0.05,
    pAdjustMethod = "BH",
    minGSSize = 1,
    maxGSSize = 500,
    TERM2GENE = dplyr::select(
            kegg_df_cln,
            gs_description,
            gene_symbol),
    #by = "DOSE",
    #nPerm = 1000
    by = "fgsea")
    
  fit_result <- fit@result %>%
    as.data.frame() %>%
    dplyr::mutate(Order = 1:nrow(fit@result)) %>%
    dplyr::left_join(kegg_df_cln %>%
                      dplyr::select(gs_description, category, subcategory) %>%
                       dplyr::distinct(),
                      by = c("ID" = "gs_description")) %>%
    dplyr::rename(Count = setSize) %>%
    dplyr::mutate(enrichmentDir = ifelse(NES > 0, group_names[1], group_names[2])) %>%
    dplyr::select(Order, ID, everything())
    
  plotdata <- fit_result %>%
      dplyr::filter(!is.na(Description)) %>%
      dplyr::group_by(category) %>%
      dplyr::slice(1:showcase) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(qvalue = as.numeric(qvalue),
                    Count = as.numeric(Count),
                    enrichmentScore = as.numeric(enrichmentScore)) %>%
      dplyr::mutate(ID = gsub("KEGG_", "", ID),
                    ID = tolower(ID)) %>%
      dplyr::arrange(desc(category), enrichmentScore)
    
  plotdata$Description <- factor(plotdata$Description,
                                    levels = unique(plotdata$Description))
  plotdata$category <- factor(plotdata$category)   
  
  res <- list(fit = fit,
              result = fit_result,
              pl = pl)

  return(res)
}

15.4 运行

使用GSEA方法计算所有差异基因的富集KEGG pathway结果


if (file.exists("./data/result/Function/GSEA_res.RDS")) {
  GSEA_result <- readRDS("./data/result/Function/GSEA_res.RDS")  
} else {
  GSEA_result <- get_GSEA(
    genelist = da_res,
    showcase = 10,
    species_type = "Homo sapiens",
    pathway_type = c("C2", "CP:KEGG"))
  
  saveRDS(GSEA_result, "./data/result/Function/GSEA_res.RDS", compress = TRUE)
}

GSEA_result$pl

结果：从图中，可以获得以下比较有用的知识：细胞周期（Cell cycle）和p53信号通路（p53 signaling pathway）在肝癌HCC中可能具有显著的影响。这两个过程与细胞的生长、增殖和凋亡紧密相关，对癌症的发生和发展至关重要。

在Early分期富集的KEGG pathway


# Tryptophan metabolism: 14
# Butanoate metabolism: 18
# Nitrogen metabolism: 44
# One carbon pool by folate: 57

enrich_early <- enrichplot::gseaplot2(
  GSEA_result$fit,
  geneSetID = c(14, 18, 44, 57),
  #pvalue_table = TRUE,
  color = c("#D51F26", "#272E6A", "#208A42", "#89288F"), 
  ES_geom = "line",
  title = "Enriched in the early stage of HCC")

enrich_early

在Late分期富集的KEGG pathway


# Cell cycle: 11
# Mismatch repair: 30
# p53 signaling pathway: 36
# ECM-receptor interaction: 42

enrich_late <- enrichplot::gseaplot2(
  GSEA_result$fit,
  geneSetID = c(11, 30, 36, 42),
  #pvalue_table = TRUE,
  color = c("#faa818", "#41a30d", "#fbdf72", "#367d7d"), 
  ES_geom = "line",
  title = "Enriched in the late stage of HCC")

enrich_late

15.5 其他可视化方法

通过GseaVis(Zhang 2022)R包在特定富集通路添加基因标签，展示表达状态。


library(GseaVis)

p53_gene <- c("SFN", "SERPINE1", "CDK1", "CHEK1", "CCNB2",
            "CCNB1", "IGFBP3", "GTSE1", "CDK4", "CCNE1")

gseaNb(object = GSEA_result$fit,
       geneSetID = 'p53 signaling pathway',
       addGene = p53_gene,       
       # newGsea = TRUE,
       # subPlot = 2,
       addPval = TRUE,
       pvalX = 0.6, pvalY = 0.8,
       pCol = "black",
       newHtCol = c("blue","white", "red"),
       pHjust = 0)

结果：富集结果增加了基因标签。

15.6 输出结果


if (!dir.exists("./data/result/Figure")) {
  dir.create("./data/result/Figure", recursive = TRUE)
}

ggsave("./data/result/Figure/Fig3-B.pdf", GSEA_result$pl, width = 8, height = 5, dpi = 600)
ggsave("./data/result/Figure/Fig3-C.pdf", enrich_early, width = 5, height = 4, dpi = 600)
ggsave("./data/result/Figure/Fig3-D.pdf", enrich_late, width = 5, height = 4, dpi = 600)

15.7 总结

KEGG通路富集分析结果揭示在不同HCC癌症分期中，富集得到的通路也不一样。比如p53 signaling pathway只富集在癌症后期，而富集在前期的是Tryptophan metabolism等。

系统信息

sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] enrichplot_1.22.0      MSnbase_2.28.1         ProtGenerics_1.34.0   
 [4] mzR_2.36.0             Rcpp_1.0.12            massdataset_1.0.29    
 [7] metid_1.2.30           metpath_1.0.8          magrittr_2.0.3        
[10] masstools_1.0.13       massdatabase_1.0.10    msigdbr_7.5.1         
[13] org.Hs.eg.db_3.18.0    AnnotationDbi_1.66.0   IRanges_2.36.0        
[16] S4Vectors_0.40.2       Biobase_2.62.0         BiocGenerics_0.48.1   
[19] clusterProfiler_4.10.1 lubridate_1.9.3        forcats_1.0.0         
[22] stringr_1.5.1          dplyr_1.1.4            purrr_1.0.2           
[25] readr_2.1.5            tidyr_1.3.1            tibble_3.2.1          
[28] ggplot2_3.5.1          tidyverse_2.0.0       

loaded via a namespace (and not attached):
  [1] splines_4.3.3               bitops_1.0-7               
  [3] ggplotify_0.1.2             cellranger_1.1.0           
  [5] polyclip_1.10-6             preprocessCore_1.64.0      
  [7] XML_3.99-0.16.1             lifecycle_1.0.4            
  [9] doParallel_1.0.17           globals_0.16.3             
 [11] lattice_0.22-6              MASS_7.3-60.0.1            
 [13] plotly_4.10.4               openxlsx_4.2.5.2           
 [15] limma_3.58.1                rmarkdown_2.26             
 [17] yaml_2.3.8                  remotes_2.5.0              
 [19] zip_2.3.1                   cowplot_1.1.3              
 [21] MsCoreUtils_1.14.1          pbapply_1.7-2              
 [23] DBI_1.2.2                   RColorBrewer_1.1-3         
 [25] abind_1.4-5                 zlibbioc_1.48.2            
 [27] rvest_1.0.4                 GenomicRanges_1.54.1       
 [29] ggraph_2.2.1                RCurl_1.98-1.14            
 [31] yulab.utils_0.1.4           tweenr_2.0.3               
 [33] circlize_0.4.16             GenomeInfoDbData_1.2.11    
 [35] ggrepel_0.9.5               listenv_0.9.1              
 [37] tidytree_0.4.6              parallelly_1.37.1          
 [39] DelayedArray_0.28.0         ncdf4_1.22                 
 [41] codetools_0.2-19            xml2_1.3.6                 
 [43] DOSE_3.28.2                 ggforce_0.4.2              
 [45] shape_1.4.6.1               tidyselect_1.2.1           
 [47] aplot_0.2.2                 farver_2.1.1               
 [49] viridis_0.6.5               matrixStats_1.3.0          
 [51] jsonlite_1.8.8              GetoptLong_1.0.5           
 [53] tidygraph_1.3.1             iterators_1.0.14           
 [55] foreach_1.5.2               progress_1.2.3             
 [57] tools_4.3.3                 treeio_1.26.0              
 [59] stringdist_0.9.12           glue_1.7.0                 
 [61] SparseArray_1.2.4           gridExtra_2.3              
 [63] xfun_0.43                   MatrixGenerics_1.14.0      
 [65] qvalue_2.34.0               GenomeInfoDb_1.38.8        
 [67] withr_3.0.0                 BiocManager_1.30.23        
 [69] fastmap_1.1.1               fansi_1.0.6                
 [71] digest_0.6.35               timechange_0.3.0           
 [73] R6_2.5.1                    gridGraphics_0.5-1         
 [75] colorspace_2.1-0            GO.db_3.19.1               
 [77] RSQLite_2.3.6               utf8_1.2.4                 
 [79] generics_0.1.3              renv_1.0.0                 
 [81] data.table_1.15.4           prettyunits_1.2.0          
 [83] S4Arrays_1.2.1              graphlayouts_1.1.1         
 [85] httr_1.4.7                  htmlwidgets_1.6.4          
 [87] scatterpie_0.2.2            pkgconfig_2.0.3            
 [89] gtable_0.3.5                blob_1.2.4                 
 [91] ComplexHeatmap_2.18.0       impute_1.76.0              
 [93] XVector_0.42.0              furrr_0.3.1                
 [95] shadowtext_0.1.3            htmltools_0.5.8.1          
 [97] fgsea_1.28.0                MALDIquant_1.22.2          
 [99] clue_0.3-65                 scales_1.3.0               
[101] png_0.1-8                   ggfun_0.1.4                
[103] knitr_1.46                  rstudioapi_0.16.0          
[105] rjson_0.2.21                tzdb_0.4.0                 
[107] reshape2_1.4.4              nlme_3.1-164               
[109] curl_5.2.1                  cachem_1.0.8               
[111] GlobalOptions_0.1.2         parallel_4.3.3             
[113] HDO.db_0.99.1               mzID_1.40.0                
[115] vsn_3.70.0                  pillar_1.9.0               
[117] grid_4.3.3                  vctrs_0.6.5                
[119] pcaMethods_1.94.0           cluster_2.1.6              
[121] evaluate_0.23               cli_3.6.2                  
[123] compiler_4.3.3              rlang_1.1.3                
[125] crayon_1.5.2                Rdisop_1.62.0              
[127] affy_1.80.0                 plyr_1.8.9                 
[129] fs_1.6.4                    stringi_1.8.4              
[131] viridisLite_0.4.2           BiocParallel_1.36.0        
[133] babelgene_22.9              munsell_0.5.1              
[135] Biostrings_2.70.3           lazyeval_0.2.2             
[137] GOSemSim_2.28.1             Matrix_1.6-5               
[139] hms_1.1.3                   patchwork_1.2.0            
[141] bit64_4.0.5                 future_1.33.2              
[143] KEGGREST_1.42.0             statmod_1.5.0              
[145] SummarizedExperiment_1.32.0 igraph_2.0.3               
[147] memoise_2.0.1               affyio_1.72.0              
[149] ggtree_3.10.1               fastmatch_1.1-4            
[151] bit_4.0.5                   readxl_1.4.3               
[153] ape_5.8                     gson_0.1.0