16  ssGSEA富集分析

ssGSEA(单样本基因集富集分析)是一种用于研究基因组学数据的分析方法。ssGSEA通过将单个样本的基因表达数据与多个基因集进行比较,来揭示不同基因集在个体中的相对活性水平。它可以将基因表达数据映射到已知的基因集上,并计算每个基因集的富集程度,从而提供一种解释个体不同表型基础的新方法。这种方法不需要进行样本间比较,可以直接对单个样本进行基因集富集分析。

16.1 加载R包

使用rm(list = ls())来清空环境中的所有变量。

library(tidyverse)
library(msigdbr)
library(massdatabase)
library(GSVA)
library(ComplexHeatmap)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)

16.2 导入数据


ExprSet <- readRDS("./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS")

16.3 所需函数


get_KEGG_category <- function(
    pathdir,
    sleeptime = 2,
    organ = "hsa") {
  
  KEGG_pathway_database <- massdatabase::convert_kegg2metpath(
    data = KEGG_data, 
    path = pathdir)    
  
  res <- KEGG_relation %>%
    dplyr::group_by(pathID) %>%
    dplyr::mutate(category = unlist(strsplit(pathClass, ";\\s+"))[1],
                  subcategory = unlist(strsplit(pathClass, ";\\s+"))[2]) %>%
    dplyr::ungroup() %>%
    dplyr::distinct() %>%
    dplyr::select(pathID, category, subcategory, pathName, Describtion)
  
  return(res)
}

get_GSVA <- function(
    exprset,
    method = c("gsva", "ssgsea"),
    showcase = 10,
    species_type = c("Mus musculus", "Homo sapiens", "zscore", "plage"),
    pathway_type = c("C2", "CP:KEGG"),
    kegg_relation = hsa_KEGG_table) {
  
  # download background genes
  kegg_df_list <- base::split(
    kegg_df$gene_symbol,
    kegg_df$gs_name)
  
  # profile
  phenotype <- Biobase::pData(exprset) %>%
    as.data.frame()
  profile <- Biobase::exprs(exprset) %>%
    as.data.frame() %>%
    as.matrix()
  
  # KEGG pathway (geneID)
  fit <- GSVA::gsva(
    expr = profile,
    gset.idx.list = kegg_df_list,
    method = method,
    kcdf = "Gaussian",
    min.sz = 15,
    max.sz = 500,
    mx.diff = TRUE,
    verbose = FALSE)
    
  fit_result <- phenotype %>%
    dplyr::inner_join(fit %>%
        t() %>%
        as.data.frame() %>%
        tibble::rownames_to_column("SampleID"),
                      by = c("SampleID"))
  
  fit_result_plot <- fit_result %>%
    # dplyr::group_by(ProjectID, Group) %>%
    # dplyr::slice(1:20) %>%
    # dplyr::ungroup() %>%
    tibble::column_to_rownames("SampleID") %>%
    dplyr::arrange(Group, Tumour_Stage)
  
  pl_temp <- ComplexHeatmap::Heatmap(
    mat = plotdata,
    top_annotation = top_anno,
    left_annotation = left_anno,
    cluster_columns = FALSE, # TRUE
    cluster_rows = TRUE,
    show_row_names = TRUE,
    show_column_names = FALSE,
    heatmap_legend_param = list(
      legend_direction = "horizontal", 
      legend_width = unit(6, "cm"),
      title = "Relative Abundance")
  )
  
  pl <- draw(pl_temp, 
             heatmap_legend_side = "left", 
             annotation_legend_side = "bottom")
  
  res <- list(fit = fit,
              result = fit_result,
              pl = pl)

  return(res)
}

16.4 运行

  • KEGG关系表
hsa_KEGG_table <- get_KEGG_category(
  pathdir = "./data/result/Function/KEGG/hsa_KEGG",
  sleeptime = 2,
  organ = "hsa") %>%
  dplyr::filter(!subcategory %in% 
       c("Drug resistance: antineoplastic", "Cardiovascular disease",
         "Infectious disease: parasitic", "Neurodegenerative disease",
         "Aging", "Cellular community - eukaryotes", "Infectious disease: bacterial",
         "Cancer: specific types", "Immune disease"))

head(hsa_KEGG_table)
  • 使用ssGSEA方法计算KEGG pathway在组间差异结果

GSVA_result <- get_GSVA(
  exprset = ExprSet,
  method = "gsva",
  showcase = 20,
  species_type = "Homo sapiens",
  pathway_type = c("C2", "CP:KEGG"),
  kegg_relation = hsa_KEGG_table)

结果:从图中没有看到在Group组间非常明显的KEGG通路,但能看到Arginine and proline metabolism相对Late Stage而言,Early Stage的颜色偏向红色也即是丰度更高一些。

16.5 输出结果


if (!dir.exists("./data/result/Figure")) {
  dir.create("./data/result/Figure", recursive = TRUE)
}

pdf("./data/result/Figure/SFig2.pdf", width = 12, height = 6)
print(GSVA_result$pl)
dev.off()

16.6 总结

在生物学和医学研究中,为了深入了解细胞或组织在特定条件下的功能状态,单样本富集分析(ssGSEA)成为了一种强有力的工具。ssGSEA特别适用于单个样本的富集分析,通过预先定义的参考基因集合(如脂肪酸代谢、糖代谢以及免疫等相关基因集),将基因表达谱数据与这些集合进行映射,进而计算出参考基因集在单样本中的功能表达值。这种方法为研究者提供了在单个样本水平上评估特定生物功能的可能性,对于深入理解生物学过程具有重要意义。

系统信息
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] grid      stats4    stats     graphics  grDevices datasets  utils    
[8] methods   base     

other attached packages:
 [1] ComplexHeatmap_2.18.0 GSVA_1.50.5           MSnbase_2.28.1       
 [4] ProtGenerics_1.34.0   S4Vectors_0.40.2      mzR_2.36.0           
 [7] Rcpp_1.0.12           Biobase_2.62.0        BiocGenerics_0.48.1  
[10] massdataset_1.0.29    metid_1.2.30          metpath_1.0.8        
[13] magrittr_2.0.3        masstools_1.0.13      massdatabase_1.0.10  
[16] msigdbr_7.5.1         lubridate_1.9.3       forcats_1.0.0        
[19] stringr_1.5.1         dplyr_1.1.4           purrr_1.0.2          
[22] readr_2.1.5           tidyr_1.3.1           tibble_3.2.1         
[25] ggplot2_3.5.1         tidyverse_2.0.0      

loaded via a namespace (and not attached):
  [1] bitops_1.0-7                cellranger_1.1.0           
  [3] polyclip_1.10-6             preprocessCore_1.64.0      
  [5] graph_1.80.0                XML_3.99-0.16.1            
  [7] lifecycle_1.0.4             doParallel_1.0.17          
  [9] globals_0.16.3              lattice_0.22-6             
 [11] MASS_7.3-60.0.1             openxlsx_4.2.5.2           
 [13] limma_3.58.1                plotly_4.10.4              
 [15] rmarkdown_2.26              yaml_2.3.8                 
 [17] remotes_2.5.0               zip_2.3.1                  
 [19] MsCoreUtils_1.14.1          pbapply_1.7-2              
 [21] DBI_1.2.2                   RColorBrewer_1.1-3         
 [23] abind_1.4-5                 zlibbioc_1.48.2            
 [25] rvest_1.0.4                 GenomicRanges_1.54.1       
 [27] ggraph_2.2.1                RCurl_1.98-1.14            
 [29] tweenr_2.0.3                circlize_0.4.16            
 [31] GenomeInfoDbData_1.2.11     IRanges_2.36.0             
 [33] ggrepel_0.9.5               irlba_2.3.5.1              
 [35] listenv_0.9.1               annotate_1.80.0            
 [37] parallelly_1.37.1           DelayedMatrixStats_1.24.0  
 [39] ncdf4_1.22                  codetools_0.2-19           
 [41] DelayedArray_0.28.0         xml2_1.3.6                 
 [43] ggforce_0.4.2               tidyselect_1.2.1           
 [45] shape_1.4.6.1               farver_2.1.1               
 [47] ScaledMatrix_1.10.0         viridis_0.6.5              
 [49] matrixStats_1.3.0           jsonlite_1.8.8             
 [51] GetoptLong_1.0.5            tidygraph_1.3.1            
 [53] iterators_1.0.14            foreach_1.5.2              
 [55] tools_4.3.3                 progress_1.2.3             
 [57] stringdist_0.9.12           glue_1.7.0                 
 [59] gridExtra_2.3               SparseArray_1.2.4          
 [61] xfun_0.43                   MatrixGenerics_1.14.0      
 [63] GenomeInfoDb_1.38.8         HDF5Array_1.30.1           
 [65] withr_3.0.0                 BiocManager_1.30.23        
 [67] fastmap_1.1.1               rhdf5filters_1.14.1        
 [69] fansi_1.0.6                 rsvd_1.0.5                 
 [71] digest_0.6.35               timechange_0.3.0           
 [73] R6_2.5.1                    colorspace_2.1-0           
 [75] RSQLite_2.3.6               utf8_1.2.4                 
 [77] generics_0.1.3              renv_1.0.0                 
 [79] data.table_1.15.4           prettyunits_1.2.0          
 [81] graphlayouts_1.1.1          httr_1.4.7                 
 [83] htmlwidgets_1.6.4           S4Arrays_1.2.1             
 [85] pkgconfig_2.0.3             gtable_0.3.5               
 [87] blob_1.2.4                  impute_1.76.0              
 [89] SingleCellExperiment_1.24.0 XVector_0.42.0             
 [91] furrr_0.3.1                 htmltools_0.5.8.1          
 [93] MALDIquant_1.22.2           GSEABase_1.64.0            
 [95] clue_0.3-65                 scales_1.3.0               
 [97] png_0.1-8                   knitr_1.46                 
 [99] rstudioapi_0.16.0           tzdb_0.4.0                 
[101] rjson_0.2.21                curl_5.2.1                 
[103] rhdf5_2.46.1                cachem_1.0.8               
[105] GlobalOptions_0.1.2         parallel_4.3.3             
[107] AnnotationDbi_1.66.0        mzID_1.40.0                
[109] vsn_3.70.0                  pillar_1.9.0               
[111] vctrs_0.6.5                 pcaMethods_1.94.0          
[113] BiocSingular_1.18.0         beachmat_2.18.1            
[115] xtable_1.8-4                cluster_2.1.6              
[117] evaluate_0.23               cli_3.6.2                  
[119] compiler_4.3.3              rlang_1.1.3                
[121] crayon_1.5.2                Rdisop_1.62.0              
[123] affy_1.80.0                 plyr_1.8.9                 
[125] stringi_1.8.4               viridisLite_0.4.2          
[127] BiocParallel_1.36.0         babelgene_22.9             
[129] munsell_0.5.1               Biostrings_2.70.3          
[131] lazyeval_0.2.2              Matrix_1.6-5               
[133] hms_1.1.3                   sparseMatrixStats_1.14.0   
[135] bit64_4.0.5                 future_1.33.2              
[137] Rhdf5lib_1.24.2             KEGGREST_1.42.0            
[139] statmod_1.5.0               SummarizedExperiment_1.32.0
[141] igraph_2.0.3                memoise_2.0.1              
[143] affyio_1.72.0               bit_4.0.5                  
[145] readxl_1.4.3