23  验证特征

在成功识别出核心特征之后,为了验证这些特征的有效性和可靠性,我们在发现数据集验证数据集上进行了进一步的评估。这一步骤旨在确保这些特征在不同数据集上的表达值具有一致性,并验证它们对区分肝癌(HCC)早晚期的能力。

1. 表达值一致性评估

2. ROC曲线评估

3. 确定最终候选核心特征

4. 结果解释与后续工作

23.1 加载R包

使用rm(list = ls())来清空环境中的所有变量。

library(tidyverse)
library(data.table)
library(Biobase)
library(pROC)
library(multipleROC)
library(ggdist)
library(gghalves)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)

23.2 导入数据


common_feature <- read.csv("./data/result/Biomarker/Biomarker_LR_RF_SVM.csv")

ExprSet <- readRDS("./data/result/ExpSetObject/MergeExpSet_VoomSNM_VoomSNM_LIRI-JP_TCGA-LIHC.RDS")

ExprSet_GSE14520 <- readRDS("./data/result/ExpSetObject/GSE14520_ExpSet_counts.RDS")

23.3 函数

  • get_raincloud:组间表达值的云雨图,使用ggdist(Kay 2023)gghalves(Tiedemann 2022)R包;

  • get_ROC:特征的ROC曲线,使用multipleROC(Moon 2019)R包。


get_raincloud <- function(
  dat,
  group,
  group_names = grp_names,
  group_colors = grp_colors,
  measures,
  angle = 30) {
  
  MergeData <- metadata %>%
    dplyr::select(SampleID, Group) %>%
    dplyr::inner_join(profile %>%
                        tibble::rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    tibble::column_to_rownames("SampleID") %>%
    dplyr::mutate(Group = factor(Group, levels = grp_names))  
  
  # group for plot x-label  
  dat_cln2 <- MergeData
  colnames(dat_cln2)[which(colnames(dat_cln2) == group)] <- "Group_new"
  
  if (group_names[1] == "all") {
    tempdata <- dat_cln2
  } else {
    tempdata <- dat_cln2 %>%
      dplyr::filter(Group_new %in% group_names)
  }
  tempdata$Group_new <- factor(tempdata$Group_new, levels = group_names)
  
  if (length(measures) > 1) {
    pl <- ggplot(plotdata, aes(x = Group_new, y = Values, fill = Group_new)) +
      ggdist::stat_halfeye(adjust = 0.5, width = 0.3,
                           .width = 0, justification = -0.3, point_colour = NA) +
      stat_boxplot(aes(color = Group_new), geom = "errorbar", width = 0.1) +    
      geom_boxplot(width = 0.1, outlier.shape = NA) + 
      gghalves::geom_half_point(side = "l", range_scale = 0.4, alpha = 0.5) +
      stat_summary(geom = "crossbar", width = 0.08, fatten = 0, color = "white", 
               fun.data = function(x){c(y = median(x), ymin = median(x), ymax = median(x))}) +       
      labs(x = "", y = "Value") + 
      scale_fill_manual(values = group_colors) +
      scale_color_manual(values = group_colors) +
      guides(fill = "none", color = "none") + 
      scale_y_continuous(expand = expansion(mult = c(0.1, 0.1))) +
      ggpubr::stat_compare_means(method = "wilcox.test",
                                 comparisons = cmp) +
      facet_wrap(.~ Index, scales = "free", nrow = 2) +      
      theme_bw() +
      theme(axis.title.y = element_text(size = 11, face = "bold"),
            axis.text.y = element_text(size = 10),
            axis.text.x = element_text(size = 9, hjust = .5, vjust = .5, angle = angle),
            strip.text = element_text(size = 10, face = "bold", color = "black"),
            text = element_text(family = "serif"))    
  } else {
    pl <- ggplot(plotdata, aes(x = Group_new, y = Values, fill = Group_new)) +
      ggdist::stat_halfeye(adjust = 0.5, width = 0.3,
                           .width = 0, justification = -0.3, point_colour = NA) +
      stat_boxplot(aes(color = Group_new), geom = "errorbar", width = 0.1) +    
      geom_boxplot(width = 0.1, outlier.shape = NA) + 
      gghalves::geom_half_point(side = "l", range_scale = 0.4, alpha = 0.5) +
      stat_summary(geom = "crossbar", width = 0.08, fatten = 0, color = "white", 
               fun.data = function(x){c(y = median(x), ymin = median(x), ymax = median(x))}) +       
      labs(x = "", y = measures) + 
      scale_fill_manual(values = group_colors) +
      scale_color_manual(values = group_colors) +
      guides(fill = "none", color = "none") + 
      scale_y_continuous(expand = expansion(mult = c(0.1, 0.1))) +
      ggpubr::stat_compare_means(method = "wilcox.test",
                                 comparisons = cmp) +
      theme_bw() +
      theme(axis.title.y = element_text(size = 11, face = "bold"),
            axis.text.y = element_text(size = 10),
            axis.text.x = element_text(size = 9, hjust = .5, vjust = .5, angle = angle),
            strip.text = element_text(size = 10, face = "bold", color = "black"),
            text = element_text(family = "serif"))    
  }
  
  return(pl)
}

get_ROC <- function(
    dat,
    group,
    index,
    type = c(1, 2, 3)) {
  
  if (type == 1) {
    print(with(dat_cln, roc(Stage ~ Feature)))
  } else if (type == 2) {
    pROC_obj <- roc(
      dat_cln$Stage,
      dat_cln$Feature,
      smoothed = TRUE,
      # arguments for ci
      ci = TRUE, 
      ci.alpha = 0.9, 
      stratified = FALSE,
      # arguments for plot
      plot = TRUE, 
      auc.polygon = TRUE, 
      max.auc.polygon = TRUE, 
      grid = TRUE,
      print.auc = TRUE, 
      show.thres = TRUE)
    
    sens.ci <- ci.se(pROC_obj)
    plot(sens.ci, type = "shape", col = "lightblue")
    plot(sens.ci, type = "bars")     
  } else if (type == 3) {
    multipleROC(Stage ~ Feature, data = dat_cln)
  } else if (type == 4) {

    ROC_fun <- function (formula, data, plot = TRUE, threshold) {
      fit = glm(formula, data = data, family = "binomial")
      fit = glm(Stage ~ Feature, data = dat_cln, family = "binomial")
      df = calSens(fit$fitted.values, fit$y)
      no = which.max(df$sum)
      cutpoint = threshold #df$x[no]
      sens = paste("Sens:", sprintf("%03.1f", df[no, ]$sens * 
        100), "%\n", "Spec:", sprintf("%03.1f", df[no, ]$spec * 
        100), "%\n", "PPV:", sprintf("%03.1f", df[no, ]$ppv * 
        100), "%\n", "NPV:", sprintf("%03.1f", df[no, ]$npv * 
        100), "%\n", sep = "")
      auc = simpleAUC(df)
      cutoff = fit$model[which(fit$fitted == cutpoint), ][-1]
      result = list(fit = fit, df = df, cutpoint = cutpoint, sens = sens, 
        auc = auc, cutoff = cutoff)
      class(result) = "multipleROC"
      if (plot) 
        print(plot(result))
      invisible(result)
    }
    ROC_fun(Stage ~ Feature, data = dat_cln, threshold = cutoff)
  }
}

23.4 重要特征的表达

  • 采用云雨图展示组间差异结果(发现数据集

rain_dis_pl <- get_raincloud(
  dat = ExprSet,
  group = "Group",
  group_names = grp_names,
  group_colors = grp_colors,
  measures = common_feature$gene,
  angle = 30)

rain_dis_pl

结果:重要特征在发现数据集的组间表达(wilcox.test的p值作为显著性水平)

  1. 在癌症早期Early富集的基因:FTCD,CYP2C9,CNGA1,ACSL6;

  2. 在癌症晚期Late富集的基因:SLC6A8,ANGPT2,ENO1,KCNJ15,SLC39A4,ETV1;

  • 采用云雨图展示组间差异结果(验证数据集GSE14520

rain_val_pl <- get_raincloud(
  dat = ExprSet_GSE14520,
  group = "Group",
  group_names = grp_names,
  group_colors = grp_colors,
  measures = common_feature$gene,
  angle = 30)

rain_val_pl
  • 重要特征的表达结果总结

common_feature_consis_all <- data.frame(
  FeatureID = c("FTCD", "CYP2C9", "CNGA1", "SLC6A8", "ANGPT2", "ENO1",
                "ACSL6", "KCNJ15", "ETV1", "SLC39A4"),
  Enrich = c(rep("Both_Early", 3), rep("Both_Late", 3), 
             rep("Non-consist", 4))
)


common_feature_consis <- data.frame(
  FeatureID = c("FTCD", "CYP2C9", "CNGA1", "SLC6A8", "ANGPT2", "ENO1"),
  Enrich = c(rep("Both_Early", 3), rep("Both_Late", 3))
)

head(common_feature_consis)

23.5 ROC分析

ROC曲线(Receiver Operating Characteristic Curve)来评估每个核心特征区分肝癌(HCC)早晚期的能力。

  • 准备数据:发现和验证数据集合并元数据和表达谱

MergeData_dis <- pData(ExprSet) %>%
  dplyr::select(SampleID, Group) %>%
  dplyr::inner_join(exprs(ExprSet) %>%
                      as.data.frame() %>%
                      t() %>%
                      as.data.frame() %>%
                      tibble::rownames_to_column("SampleID"),
                    by = "SampleID") %>%
  tibble::column_to_rownames("SampleID") %>%
  dplyr::mutate(Group = factor(Group, levels = grp_names))

head(MergeData_dis[, 1:6])


MergeData_val <- pData(ExprSet_GSE14520) %>%
  dplyr::select(SampleID, Group) %>%
  dplyr::inner_join(exprs(ExprSet_GSE14520) %>%
                      as.data.frame() %>%
                      t() %>%
                      as.data.frame() %>%
                      tibble::rownames_to_column("SampleID"),
                    by = "SampleID") %>%
  tibble::column_to_rownames("SampleID") %>%
  dplyr::mutate(Group = factor(Group, levels = grp_names))

head(MergeData_val[, 1:6])
  • ROC曲线展示区分癌症分期能力(发现数据集

for (gene in common_feature_consis$FeatureID) {

  temp_roc_pl <- get_ROC(
      dat = MergeData_dis,
      group = "Group",
      index = gene,
      type = 3)
}
  • ROC曲线展示区分癌症分期能力(验证数据集GSE14520

for (gene in common_feature_consis$FeatureID) {

  temp_roc_pl <- get_ROC(
      dat = MergeData_val,
      group = "Group",
      index = gene,
      type = 3)
}

23.6 汇总筛选结果

通过表达值一致性AUC两个指标,我们确定了最终的候选核心特征,它们便是SLC6A8ANGPT2,但SLC6A8的AUC在验证数据集更高一些。现在我们汇总这两个核心特征的表达值一致性AUC结果。

  • 采用云雨图展示组间差异结果

rain_dis_SLC6A8 <- get_raincloud(
  dat = ExprSet,
  group = "Group",
  group_names = grp_names,
  group_colors = grp_colors,
  measures = "SLC6A8",
  angle = 30)

rain_dis_SLC6A8

rain_val_SLC6A8 <- get_raincloud(
  dat = ExprSet_GSE14520,
  group = "Group",
  group_names = grp_names,
  group_colors = grp_colors,
  measures = "SLC6A8",
  angle = 30)

rain_val_SLC6A8

结果:核心特征SLC6A8在数据集的组间表达

  • ROC曲线展示区分癌症分期能力

dis_roc_SLC6A8 <- get_ROC(
    dat = MergeData_dis,
    group = "Group",
    index = "SLC6A8",
    type = 3)

val_roc_SLC6A8 <- get_ROC(
    dat = MergeData_val,
    group = "Group",
    index = "SLC6A8",
    type = 3)

23.7 输出结果


if (!dir.exists("./data/result/Biomarker/")) {
  dir.create("./data/result/Biomarker/", recursive = TRUE)
}

write.csv(common_feature_consis_all, "./data/result/Biomarker/Biomarker_LR_RF_SVM_validation.csv", row.names = F)


if (!dir.exists("./data/result/Figure/")) {
  dir.create("./data/result/Figure/", recursive = TRUE)
}

ggsave("./data/result/Figure/Fig5-B.pdf", rain_dis_SLC6A8, width = 5, height = 4, dpi = 600) 
ggsave("./data/result/Figure/Fig5-C.pdf", rain_val_SLC6A8, width = 5, height = 4, dpi = 600) 

pdf("./data/result/Figure/Fig5-D.pdf", width = 5, height = 4)  
dis_roc_SLC6A8 <- get_ROC(
    dat = MergeData_dis,
    group = "Group",
    index = "SLC6A8",
    type = 3)
dev.off() 

pdf("./data/result/Figure/Fig5-E.pdf", width = 5, height = 4)  
val_roc_SLC6A8 <- get_ROC(
    dat = MergeData_val,
    group = "Group",
    index = "SLC6A8",
    type = 3)
dev.off() 


ggsave("./data/result/Figure/SFig4-A.pdf", rain_dis_pl, width = 9, height = 7, dpi = 600) 
ggsave("./data/result/Figure/SFig4-B.pdf", rain_val_pl, width = 9, height = 7, dpi = 600) 

for (gene in common_feature_consis$FeatureID) {

  filename <- paste0("./data/result/Figure/", "SFig4-C-discovery-", gene, ".pdf")
  pdf(filename, width = 5, height = 4)  
  temp_roc_pl_dis <- get_ROC(
      dat = MergeData_dis,
      group = "Group",
      index = gene,
      type = 3)
  dev.off()   
}

for (gene in common_feature_consis$FeatureID) {

  filename <- paste0("./data/result/Figure/", "SFig4-D-validation-", gene, ".pdf")
  pdf(filename, width = 5, height = 4)  
  temp_roc_pl_val <- get_ROC(
      dat = MergeData_val,
      group = "Group",
      index = gene,
      type = 3)
  dev.off()   
}

23.8 总结

在深入分析数据集的过程中,我们特别关注于识别那些对区分样本早晚期状态具有重要意义的特征。通过对发现数据集和验证数据集上的关键特征进行细致比较,我们评估了这些特征的表达一致性和其基于受试者工作特性曲线(ROC)的区分能力。

系统信息
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] gghalves_0.1.4      ggdist_3.3.2        multipleROC_0.1.1  
 [4] pROC_1.18.5         Biobase_2.62.0      BiocGenerics_0.48.1
 [7] data.table_1.15.4   lubridate_1.9.3     forcats_1.0.0      
[10] stringr_1.5.1       dplyr_1.1.4         purrr_1.0.2        
[13] readr_2.1.5         tidyr_1.3.1         tibble_3.2.1       
[16] ggplot2_3.5.1       tidyverse_2.0.0    

loaded via a namespace (and not attached):
 [1] utf8_1.2.4           generics_0.1.3       renv_1.0.0          
 [4] stringi_1.8.4        hms_1.1.3            digest_0.6.35       
 [7] magrittr_2.0.3       evaluate_0.23        grid_4.3.3          
[10] timechange_0.3.0     fastmap_1.1.1        plyr_1.8.9          
[13] jsonlite_1.8.8       BiocManager_1.30.23  fansi_1.0.6         
[16] scales_1.3.0         cli_3.6.2            rlang_1.1.3         
[19] munsell_0.5.1        withr_3.0.0          yaml_2.3.8          
[22] tools_4.3.3          tzdb_0.4.0           colorspace_2.1-0    
[25] vctrs_0.6.5          R6_2.5.1             lifecycle_1.0.4     
[28] htmlwidgets_1.6.4    pkgconfig_2.0.3      pillar_1.9.0        
[31] gtable_0.3.5         glue_1.7.0           Rcpp_1.0.12         
[34] xfun_0.43            tidyselect_1.2.1     rstudioapi_0.16.0   
[37] knitr_1.46           htmltools_0.5.8.1    rmarkdown_2.26      
[40] compiler_4.3.3       distributional_0.4.0