library(tidyverse)
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(massdatabase)
library(enrichplot)
rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)
grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)
15 KEGG通路富集分析
KEGG pathway是京都基因与基因组百科全书(Kyoto Encyclopedia of Genes and Genomes,简称KEGG)中的一个子数据库,它包括了代谢、调控、通路、生化、疾病、药物等相关的分子相互作用和关系网络。
KEGG通路富集分析是一种用于分析高通量实验数据的方法,其目标是确定在实验中观察到的基因或其他实体是否集中在特定的KEGG通路中,以便推断这些通路在实验条件下是否显著富集。在生物学研究中,通过通路富集分析可以了解一组基因是否在某些已知的代谢通路、信号通路或细胞过程中显著富集,从而揭示这些基因在生物体内可能的功能和相互作用。这种分析方法通常利用KEGG等数据库提供的已知生物通路信息,结合超几何分布等方法计算给定基因集在某个通路上的P值,以判断该基因集在该通路上的富集程度是否显著。
15.1 加载R包
使用rm(list = ls())
来清空环境中的所有变量。
15.2 导入数据
15.3 所需函数
get_KEGG_category
获得KEGG pathway的不同层级关系表;get_GSEA
基于GSEA方法功能富集分析;使用
clusterProfiler::GSEA
(Yu 等 2012)做GSEA分析;使用
msigdbr::msigdbr
(Dolgalev 2020)拿到基因和通路关系对应表。下载KEGG关系表的函数,它们均来自于
massdatabase
包(Shen 等 2022)。
get_KEGG_category <- function(
pathdir,
sleeptime = 2,
organ = "hsa") {
KEGG_pathway_database <- massdatabase::convert_kegg2metpath(
data = KEGG_data,
path = pathdir)
res <- KEGG_relation %>%
dplyr::group_by(pathID) %>%
dplyr::mutate(category = unlist(strsplit(pathClass, ";\\s+"))[1],
subcategory = unlist(strsplit(pathClass, ";\\s+"))[2]) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::select(pathID, category, subcategory, pathName, Describtion)
return(res)
}
get_GSEA <- function(
genelist,
showcase = 10,
species_type = c("Mus musculus", "Homo sapiens"),
pathway_type = c("C2", "CP:KEGG"),
group_names = grp_names) {
# KEGG table
if (!dir.exists("./data/result/Function")) {
dir.create("./data/result/Function", recursive = TRUE)
}
hsa_kegg_tab <- get_KEGG_category(
pathdir = "./data/result/Function/KEGG/hsa_KEGG",
sleeptime = 2,
organ = "hsa") %>%
dplyr::filter(!subcategory %in%
c("Drug resistance: antineoplastic", "Cardiovascular disease",
"Infectious disease: parasitic", "Neurodegenerative disease",
"Aging", "Cellular community - eukaryotes", "Infectious disease: bacterial",
"Cancer: specific types"))
genelist_new <- genelist %>%
dplyr::arrange(desc(logFC))
inputgenes <- genelist_new$logFC
names(inputgenes) <- genelist_new$FeatureID
# download background genes
kegg_df_cln <- kegg_df %>%
dplyr::inner_join(hsa_kegg_tab %>%
dplyr::select(category, subcategory, pathName),
by = c("gs_description" = "pathName"))
# KEGG pathway (geneID)
fit <- clusterProfiler::GSEA(
geneList = inputgenes,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 1,
maxGSSize = 500,
TERM2GENE = dplyr::select(
kegg_df_cln,
gs_description,
gene_symbol),
#by = "DOSE",
#nPerm = 1000
by = "fgsea")
fit_result <- fit@result %>%
as.data.frame() %>%
dplyr::mutate(Order = 1:nrow(fit@result)) %>%
dplyr::left_join(kegg_df_cln %>%
dplyr::select(gs_description, category, subcategory) %>%
dplyr::distinct(),
by = c("ID" = "gs_description")) %>%
dplyr::rename(Count = setSize) %>%
dplyr::mutate(enrichmentDir = ifelse(NES > 0, group_names[1], group_names[2])) %>%
dplyr::select(Order, ID, everything())
plotdata <- fit_result %>%
dplyr::filter(!is.na(Description)) %>%
dplyr::group_by(category) %>%
dplyr::slice(1:showcase) %>%
dplyr::ungroup() %>%
dplyr::mutate(qvalue = as.numeric(qvalue),
Count = as.numeric(Count),
enrichmentScore = as.numeric(enrichmentScore)) %>%
dplyr::mutate(ID = gsub("KEGG_", "", ID),
ID = tolower(ID)) %>%
dplyr::arrange(desc(category), enrichmentScore)
plotdata$Description <- factor(plotdata$Description,
levels = unique(plotdata$Description))
plotdata$category <- factor(plotdata$category)
res <- list(fit = fit,
result = fit_result,
pl = pl)
return(res)
}
15.4 运行
- 使用GSEA方法计算所有差异基因的富集KEGG pathway结果
if (file.exists("./data/result/Function/GSEA_res.RDS")) {
GSEA_result <- readRDS("./data/result/Function/GSEA_res.RDS")
} else {
GSEA_result <- get_GSEA(
genelist = da_res,
showcase = 10,
species_type = "Homo sapiens",
pathway_type = c("C2", "CP:KEGG"))
saveRDS(GSEA_result, "./data/result/Function/GSEA_res.RDS", compress = TRUE)
}
GSEA_result$pl
结果:从图中,可以获得以下比较有用的知识:细胞周期(Cell cycle)和p53信号通路(p53 signaling pathway)在肝癌HCC中可能具有显著的影响。这两个过程与细胞的生长、增殖和凋亡紧密相关,对癌症的发生和发展至关重要。
- 在Early分期富集的KEGG pathway
# Tryptophan metabolism: 14
# Butanoate metabolism: 18
# Nitrogen metabolism: 44
# One carbon pool by folate: 57
enrich_early <- enrichplot::gseaplot2(
GSEA_result$fit,
geneSetID = c(14, 18, 44, 57),
#pvalue_table = TRUE,
color = c("#D51F26", "#272E6A", "#208A42", "#89288F"),
ES_geom = "line",
title = "Enriched in the early stage of HCC")
enrich_early
- 在Late分期富集的KEGG pathway
# Cell cycle: 11
# Mismatch repair: 30
# p53 signaling pathway: 36
# ECM-receptor interaction: 42
enrich_late <- enrichplot::gseaplot2(
GSEA_result$fit,
geneSetID = c(11, 30, 36, 42),
#pvalue_table = TRUE,
color = c("#faa818", "#41a30d", "#fbdf72", "#367d7d"),
ES_geom = "line",
title = "Enriched in the late stage of HCC")
enrich_late
15.5 其他可视化方法
通过GseaVis(Zhang 2022)R包在特定富集通路添加基因标签,展示表达状态。
library(GseaVis)
p53_gene <- c("SFN", "SERPINE1", "CDK1", "CHEK1", "CCNB2",
"CCNB1", "IGFBP3", "GTSE1", "CDK4", "CCNE1")
gseaNb(object = GSEA_result$fit,
geneSetID = 'p53 signaling pathway',
addGene = p53_gene,
# newGsea = TRUE,
# subPlot = 2,
addPval = TRUE,
pvalX = 0.6, pvalY = 0.8,
pCol = "black",
newHtCol = c("blue","white", "red"),
pHjust = 0)
结果:富集结果增加了基因标签。
15.6 输出结果
if (!dir.exists("./data/result/Figure")) {
dir.create("./data/result/Figure", recursive = TRUE)
}
ggsave("./data/result/Figure/Fig3-B.pdf", GSEA_result$pl, width = 8, height = 5, dpi = 600)
ggsave("./data/result/Figure/Fig3-C.pdf", enrich_early, width = 5, height = 4, dpi = 600)
ggsave("./data/result/Figure/Fig3-D.pdf", enrich_late, width = 5, height = 4, dpi = 600)
15.7 总结
KEGG通路富集分析结果揭示在不同HCC癌症分期中,富集得到的通路也不一样。比如p53 signaling pathway
只富集在癌症后期,而富集在前期的是Tryptophan metabolism
等。
系统信息
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] enrichplot_1.22.0 MSnbase_2.28.1 ProtGenerics_1.34.0
[4] mzR_2.36.0 Rcpp_1.0.12 massdataset_1.0.29
[7] metid_1.2.30 metpath_1.0.8 magrittr_2.0.3
[10] masstools_1.0.13 massdatabase_1.0.10 msigdbr_7.5.1
[13] org.Hs.eg.db_3.18.0 AnnotationDbi_1.66.0 IRanges_2.36.0
[16] S4Vectors_0.40.2 Biobase_2.62.0 BiocGenerics_0.48.1
[19] clusterProfiler_4.10.1 lubridate_1.9.3 forcats_1.0.0
[22] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
[25] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[28] ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] splines_4.3.3 bitops_1.0-7
[3] ggplotify_0.1.2 cellranger_1.1.0
[5] polyclip_1.10-6 preprocessCore_1.64.0
[7] XML_3.99-0.16.1 lifecycle_1.0.4
[9] doParallel_1.0.17 globals_0.16.3
[11] lattice_0.22-6 MASS_7.3-60.0.1
[13] plotly_4.10.4 openxlsx_4.2.5.2
[15] limma_3.58.1 rmarkdown_2.26
[17] yaml_2.3.8 remotes_2.5.0
[19] zip_2.3.1 cowplot_1.1.3
[21] MsCoreUtils_1.14.1 pbapply_1.7-2
[23] DBI_1.2.2 RColorBrewer_1.1-3
[25] abind_1.4-5 zlibbioc_1.48.2
[27] rvest_1.0.4 GenomicRanges_1.54.1
[29] ggraph_2.2.1 RCurl_1.98-1.14
[31] yulab.utils_0.1.4 tweenr_2.0.3
[33] circlize_0.4.16 GenomeInfoDbData_1.2.11
[35] ggrepel_0.9.5 listenv_0.9.1
[37] tidytree_0.4.6 parallelly_1.37.1
[39] DelayedArray_0.28.0 ncdf4_1.22
[41] codetools_0.2-19 xml2_1.3.6
[43] DOSE_3.28.2 ggforce_0.4.2
[45] shape_1.4.6.1 tidyselect_1.2.1
[47] aplot_0.2.2 farver_2.1.1
[49] viridis_0.6.5 matrixStats_1.3.0
[51] jsonlite_1.8.8 GetoptLong_1.0.5
[53] tidygraph_1.3.1 iterators_1.0.14
[55] foreach_1.5.2 progress_1.2.3
[57] tools_4.3.3 treeio_1.26.0
[59] stringdist_0.9.12 glue_1.7.0
[61] SparseArray_1.2.4 gridExtra_2.3
[63] xfun_0.43 MatrixGenerics_1.14.0
[65] qvalue_2.34.0 GenomeInfoDb_1.38.8
[67] withr_3.0.0 BiocManager_1.30.23
[69] fastmap_1.1.1 fansi_1.0.6
[71] digest_0.6.35 timechange_0.3.0
[73] R6_2.5.1 gridGraphics_0.5-1
[75] colorspace_2.1-0 GO.db_3.19.1
[77] RSQLite_2.3.6 utf8_1.2.4
[79] generics_0.1.3 renv_1.0.0
[81] data.table_1.15.4 prettyunits_1.2.0
[83] S4Arrays_1.2.1 graphlayouts_1.1.1
[85] httr_1.4.7 htmlwidgets_1.6.4
[87] scatterpie_0.2.2 pkgconfig_2.0.3
[89] gtable_0.3.5 blob_1.2.4
[91] ComplexHeatmap_2.18.0 impute_1.76.0
[93] XVector_0.42.0 furrr_0.3.1
[95] shadowtext_0.1.3 htmltools_0.5.8.1
[97] fgsea_1.28.0 MALDIquant_1.22.2
[99] clue_0.3-65 scales_1.3.0
[101] png_0.1-8 ggfun_0.1.4
[103] knitr_1.46 rstudioapi_0.16.0
[105] rjson_0.2.21 tzdb_0.4.0
[107] reshape2_1.4.4 nlme_3.1-164
[109] curl_5.2.1 cachem_1.0.8
[111] GlobalOptions_0.1.2 parallel_4.3.3
[113] HDO.db_0.99.1 mzID_1.40.0
[115] vsn_3.70.0 pillar_1.9.0
[117] grid_4.3.3 vctrs_0.6.5
[119] pcaMethods_1.94.0 cluster_2.1.6
[121] evaluate_0.23 cli_3.6.2
[123] compiler_4.3.3 rlang_1.1.3
[125] crayon_1.5.2 Rdisop_1.62.0
[127] affy_1.80.0 plyr_1.8.9
[129] fs_1.6.4 stringi_1.8.4
[131] viridisLite_0.4.2 BiocParallel_1.36.0
[133] babelgene_22.9 munsell_0.5.1
[135] Biostrings_2.70.3 lazyeval_0.2.2
[137] GOSemSim_2.28.1 Matrix_1.6-5
[139] hms_1.1.3 patchwork_1.2.0
[141] bit64_4.0.5 future_1.33.2
[143] KEGGREST_1.42.0 statmod_1.5.0
[145] SummarizedExperiment_1.32.0 igraph_2.0.3
[147] memoise_2.0.1 affyio_1.72.0
[149] ggtree_3.10.1 fastmatch_1.1-4
[151] bit_4.0.5 readxl_1.4.3
[153] ape_5.8 gson_0.1.0