8 数据对象

ExpressionSet是Biobase(Huber 等 2015)包提供的一种数据对象，专为存储生物学实验数据而设计。它允许用户将表型数据、表达谱以及特征数据（如基因或蛋白质标识符）集成在一个统一的数据结构中。在本章节中，将利用这种数据对象来组织并处理获得的三个数据集：LIRI-JP，LIHC-US/TCGA-LIHC和GSE14520。

8.1 LIRI-JP

要生成LIRI-JP的ExpressionSet对象，首先需要加载必要的R包，然后导入数据和元数据，最后将这些数据组合成一个ExpressionSet对象。

8.1.1 加载R包

加载R包：使用rm(list = ls())清空环境内变量

library(tidyverse)
library(data.table)
library(Biobase)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

8.1.2 导入数据

输入数据来自于章节 7

LIRIJP_count_data <- fread("./data/result/profile/LIRI-JP_gene_counts.tsv")
LIRIJP_relative_data <- fread("./data/result/profile/LIRI-JP_gene_relative.tsv") 
LIRIJP_phenotype <- read.csv("./data/result/metadata/LIRI-JP-specimen.csv")

8.1.3 生成ExpressionSet

生成ExpressionSet-class数据对象：过滤掉低出现率的基因，默认出现率阈值是20%

get_ExprSet_LIRIJP <- function(
    profile,
    metadata,
    occurrence = 0.2) {
  
  profile_new <- profile %>%
    tibble::column_to_rownames("V1")
  metadata_new <- metadata %>%
    dplyr::filter(Tumour_Stage != "") %>%
    dplyr::mutate(Group = case_when(
      Tumour_Stage %in% c("T1", "T2") ~ "Early Stage", 
      Tumour_Stage %in% c("T3", "T4") ~ "Late Stage"))
  
  # metadata Description
  sid <- dplyr::intersect(colnames(profile_new), metadata_new$SpecimenID)
  phen <- metadata_new[pmatch(sid, metadata_new$SpecimenID), , ]
  phen <- phen[pmatch(unique(phen$PatientID), phen$PatientID), , ]
  prof <- profile_new %>%
    dplyr::select(dplyr::all_of(phen$SpecimenID))
  
  if (all(colnames(prof) == phen$SpecimenID)) {
    colnames(prof) <- phen$PatientID
  }
  
  # filter features according gene symbol
  prof_cln <- prof %>%
    tibble::rownames_to_column("Gene") %>%
    data.frame()
  idx <- grep("Gene", colnames(prof_cln))
  prof_cln$median <- apply(prof_cln[, -idx], 1, median)
  prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
  prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
    dplyr::select(-median)
  
  # return result: gene symbol
  rownames(prf_deduplicated) <- NULL
  prof_res <- prf_deduplicated %>%
    dplyr::select(Gene, everything()) %>%
    tibble::column_to_rownames("Gene")
  
  # determine the right order between profile and phenotype
  for (i in 1:ncol(prof_trim)) {
    if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
      stop(paste0(i, " Wrong"))
    }
  }
  
  expressionSet <- new("ExpressionSet", 
                       exprs = exprs,
                       phenoData = adf,
                       experimentData = experimentData)
  
  return(expressionSet)
}

LIRIJP_ExprSet_count <- get_ExprSet_LIRIJP(
  profile = LIRIJP_count_data,
  metadata = LIRIJP_phenotype,
  occurrence = 0.2)

LIRIJP_ExprSet_rela <- get_ExprSet_LIRIJP(
  profile = LIRIJP_relative_data,
  metadata = LIRIJP_phenotype,
  occurrence = 0.2)
LIRIJP_ExprSet_rela

8.1.4 输出结果

输出结果: 将上述结果以RDS格式输出到结果目录

if (!dir.exists("./data/result/ExpSetObject/")) {
  dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}

saveRDS(LIRIJP_ExprSet_count, "./data/result/ExpSetObject/LIRI-JP_ExpSet_counts.RDS", compress = TRUE)
saveRDS(LIRIJP_ExprSet_rela, "./data/result/ExpSetObject/LIRI-JP_ExpSet_relative.RDS", compress = TRUE)

8.2 TCGA-LIHC

要生成TCGA-LIHC的ExpressionSet对象，首先需要加载必要的R包，然后导入数据和元数据，最后将这些数据组合成一个ExpressionSet对象。

8.2.1 加载R包

加载R包：使用rm(list = ls())清空环境内变量

library(tidyverse)
library(data.table)
library(Biobase)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

8.2.2 导入数据


TCGALIHC_count_data <- fread("./data/result/profile/TCGA-LIHC_gene_counts.tsv")
TCGALIHC_TrueCount_data <- fread("./data/result/profile/TCGA-LIHC_gene_Truecounts.tsv")
TCGALIHC_fpkm_data <- fread("./data/result/profile/TCGA-LIHC_gene_fpkm.tsv") 
TCGALIHC_phenotype <- read.csv("./data/result/metadata/TCGA-LIHC-clinical.csv")

8.2.3 生成ExpressionSet

生成ExpressionSet-class数据对象：过滤掉低出现率的基因，默认出现率阈值是20%


get_ExprSet_TCGALIHC <- function(
    profile,
    metadata,
    occurrence = 0.2) {
  
  profile_new <- profile %>%
    tibble::column_to_rownames("V1")
  metadata_new <- metadata %>%
    dplyr::filter(Tumour_Stage != "") %>%
    dplyr::mutate(Group = case_when(
      Tumour_Stage %in% c("T1", "T2") ~ "Early Stage", 
      Tumour_Stage %in% c("T3", "T4") ~ "Late Stage")) %>%
    dplyr::mutate(Time = ifelse(is.na(death_Time), follow_up_Time, death_Time))
  
  # metadata Description
  sid <- dplyr::intersect(colnames(profile_new), metadata_new$PatientID)
  phen <- metadata_new[pmatch(sid, metadata_new$PatientID), , ]
  prof <- profile_new %>%
    dplyr::select(dplyr::all_of(phen$PatientID))
  
  # filter features according gene symbol
  prof_cln <- prof %>%
    tibble::rownames_to_column("Gene") %>%
    data.frame()
  idx <- grep("Gene", colnames(prof_cln))
  prof_cln$median <- apply(prof_cln[, -idx], 1, median)
  prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
  prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
    dplyr::select(-median)
  
  # return result: gene symbol
  rownames(prf_deduplicated) <- NULL
  prof_res <- prf_deduplicated %>%
    dplyr::select(Gene, everything()) %>%
    tibble::column_to_rownames("Gene")
  
  # determine the right order between profile and phenotype
  for (i in 1:ncol(prof_trim)) {
    if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
      stop(paste0(i, " Wrong"))
    }
  }

  expressionSet <- new("ExpressionSet", 
                       exprs = exprs,
                       phenoData = adf,
                       experimentData = experimentData)
  
  return(expressionSet)
}

TCGALIHC_ExprSet_count <- get_ExprSet_TCGALIHC(
  profile = TCGALIHC_count_data,
  metadata = TCGALIHC_phenotype,
  occurrence = 0.2)
# TCGALIHC_ExprSet_count

TCGALIHC_ExprSet_TrueCount <- get_ExprSet_TCGALIHC(
  profile = TCGALIHC_TrueCount_data,
  metadata = TCGALIHC_phenotype,
  occurrence = 0.2)
# TCGALIHC_ExprSet_TrueCount

TCGALIHC_ExprSet_fpkm <- get_ExprSet_TCGALIHC(
  profile = TCGALIHC_fpkm_data,
  metadata = TCGALIHC_phenotype,
  occurrence = 0.2)
TCGALIHC_ExprSet_fpkm

8.2.4 输出结果

输出结果: 将上述结果以RDS格式输出到结果目录


if (!dir.exists("./data/result/ExpSetObject/")) {
  dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}

saveRDS(TCGALIHC_ExprSet_count, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_counts.RDS", compress = TRUE)
saveRDS(TCGALIHC_ExprSet_TrueCount, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_TrueCount.RDS", compress = TRUE)
saveRDS(TCGALIHC_ExprSet_fpkm, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_fpkm.RDS", compress = TRUE)

8.3 GSE14520

要生成GSE14520的ExpressionSet对象，首先需要加载必要的R包，然后导入数据和元数据，最后将这些数据组合成一个ExpressionSet对象。

8.3.1 加载R包

加载R包：使用rm(list = ls())清空环境内变量

library(tidyverse)
library(data.table)
library(Biobase)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

8.3.2 导入数据

导入数据


GSE14520_count_data <- fread("./data/result/profile/GSE14520_gene_counts.tsv")
GSE14520_phenotype <- read.csv("./data/result/metadata/GSE14520-clinical.csv")

8.3.3 生成ExpressionSet

生成ExpressionSet-class数据对象：过滤掉低出现率的基因，默认出现率阈值是20%


get_ExprSet_GSE14520 <- function(
    profile,
    metadata,
    occurrence = 0.2) {
  
  profile_new <- profile %>%
    tibble::column_to_rownames("V1")
  metadata_new <- metadata %>%
    dplyr::filter(Tumour_Stage != "") %>%
    dplyr::mutate(Group = case_when(
      Tumour_Stage %in% c("T1", "T2") ~ "Early Stage", 
      Tumour_Stage %in% c("T3", "T4") ~ "Late Stage")) 
  
  # metadata Description
  sid <- dplyr::intersect(colnames(profile_new), metadata_new$SampleID)
  phen <- metadata_new[pmatch(sid, metadata_new$SampleID), , ]
  prof <- profile_new %>%
    dplyr::select(dplyr::all_of(phen$SampleID))
  
  
  # return result: gene symbol
  rownames(prf_deduplicated) <- NULL
  prof_res <- prf_deduplicated %>%
    dplyr::select(Gene, everything()) %>%
    tibble::column_to_rownames("Gene")
  
  # determine the right order between profile and phenotype
  for (i in 1:ncol(prof_trim)) {
    if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
      stop(paste0(i, " Wrong"))
    }
  }
  
  expressionSet <- new("ExpressionSet", 
                       exprs = exprs,
                       phenoData = adf,
                       experimentData = experimentData)
  
  return(expressionSet)
}

GSE14520_ExprSet_count <- get_ExprSet_GSE14520(
  profile = GSE14520_count_data,
  metadata = GSE14520_phenotype,
  occurrence = 0.2)
GSE14520_ExprSet_count

8.3.4 输出结果

输出结果: 将上述结果以RDS格式输出到结果目录


if (!dir.exists("./data/result/ExpSetObject/")) {
  dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}

saveRDS(GSE14520_ExprSet_count, "./data/result/ExpSetObject/GSE14520_ExpSet_counts.RDS", compress = TRUE)

8.4 重叠基因

为了确保在发现数据集中识别的标记基因能够在验证数据集中进行准确的验证，首先加载了基于count abundance的RDS，这些RDS分别包含LIRI-JP，LIHC-US/TCGA-LIHC和GSE14520数据集的基因表达信息。

步骤：

加载RDS文件：首先，将分别加载LIRI-JP，LIHC-US/TCGA-LIHC和GSE14520的count abundance的RDS文件。
提取基因标识符：从每个数据集中提取基因标识符列表。
获取基因交集：比较这三个基因标识符列表，找出它们之间的交集。这个交集将包含所有在三个数据集中都存在的基因。
更新数据集：使用交集中的基因标识符来过滤和更新每个数据集，以确保后续分析中的基因一致性。

8.4.1 加载R包

加载R包

library(tidyverse)
library(data.table)
library(Biobase)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

8.4.2 导入数据

导入数据

ExprSet_LIRI_JP <- readRDS("./data/result/ExpSetObject/LIRI-JP_ExpSet_counts.RDS")
ExprSet_TCGA_LIHC <- readRDS("./data/result/ExpSetObject/TCGA-LIHC_ExpSet_TrueCount.RDS")
ExprSet_GSE14520 <- readRDS("./data/result/ExpSetObject/GSE14520_ExpSet_counts.RDS")

8.4.3 共有基因

共有基因


length(overlap_genes)

8.4.4 生成数据对象

生成数据对象


get_ExprSet <- function(
    dat,
    gene_list = overlap_genes,
    occurrence = 0.2) {
  
  profile <- exprs(dat) %>%
    as.data.frame() %>%
    t() %>%
    as.data.frame() %>%
    dplyr::select(all_of(overlap_genes)) %>%
    t() %>%
    as.data.frame()
  metadata <- pData(dat) %>%
    as.data.frame()
  
  # metadata Description
  sid <- dplyr::intersect(colnames(profile), rownames(metadata))
  phen <- metadata[pmatch(sid, rownames(metadata)), , ]
  prof <- profile %>%
    dplyr::select(dplyr::all_of(rownames(phen)))
  
  # filter features according gene symbol
  prof_cln <- prof %>%
    tibble::rownames_to_column("Gene") %>%
    data.frame()
  idx <- grep("Gene", colnames(prof_cln))
  prof_cln$median <- apply(prof_cln[, -idx], 1, median)
  prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
  prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
    dplyr::select(-median)

  
  # determine the right order between profile and phenotype
  for (i in 1:ncol(prof_trim)) {
    if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
      stop(paste0(i, " Wrong"))
    }
  }
  expressionSet <- new("ExpressionSet", 
                       exprs = exprs,
                       phenoData = adf,
                       experimentData = experimentData)
  
  return(expressionSet)
}


ExprSet_LIRI_JP_new <- get_ExprSet(
  dat = ExprSet_LIRI_JP,
  gene_list = overlap_genes,
  occurrence = 0.2)
ExprSet_LIRI_JP_new

ExprSet_TCGA_LIHC_new <- get_ExprSet(
  dat = ExprSet_TCGA_LIHC,
  gene_list = overlap_genes,
  occurrence = 0.2)
ExprSet_TCGA_LIHC_new

ExprSet_GSE14520_new <- get_ExprSet(
  dat = ExprSet_GSE14520,
  gene_list = overlap_genes,
  occurrence = 0.2)
ExprSet_GSE14520_new

8.4.5 输出结果

输出结果: 将上述结果以RDS格式输出到结果目录


if (!dir.exists("./data/result/ExpSetObject/")) {
  dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}

saveRDS(ExprSet_LIRI_JP_new, "./data/result/ExpSetObject/Final_ExpSet_LIRI_JP_TrueCounts.RDS", compress = TRUE)
saveRDS(ExprSet_TCGA_LIHC_new, "./data/result/ExpSetObject/Final_ExpSet_TCGA_LIHC_TrueCounts.RDS", compress = TRUE)
saveRDS(ExprSet_GSE14520_new, "./data/result/ExpSetObject/Final_ExpSet_GSE14520_TrueCounts.RDS", compress = TRUE)

8.5 总结

在完成了对三个肝细胞癌（HCC）群体的转录组和临床数据的预处理后，分别将这些数据转换为了ExpressionSet数据对象，并保存为RDS文件格式，以便于后续的数据管理和分析。为了确保后续分析的一致性和可靠性，进一步从这些RDS文件中提取了三个数据集中共有的基因。这些共有的基因是后续分析的基础，因为它们代表了三个群体中均存在并可能参与HCC发展的生物学过程。

系统信息

sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] Biobase_2.62.0      BiocGenerics_0.48.1 data.table_1.15.4  
 [4] lubridate_1.9.3     forcats_1.0.0       stringr_1.5.1      
 [7] dplyr_1.1.4         purrr_1.0.2         readr_2.1.5        
[10] tidyr_1.3.1         tibble_3.2.1        ggplot2_3.5.1      
[13] tidyverse_2.0.0    

loaded via a namespace (and not attached):
 [1] gtable_0.3.5        jsonlite_1.8.8      compiler_4.3.3     
 [4] BiocManager_1.30.23 renv_1.0.0          tidyselect_1.2.1   
 [7] scales_1.3.0        yaml_2.3.8          fastmap_1.1.1      
[10] R6_2.5.1            generics_0.1.3      knitr_1.46         
[13] htmlwidgets_1.6.4   munsell_0.5.1       tzdb_0.4.0         
[16] pillar_1.9.0        rlang_1.1.3         utf8_1.2.4         
[19] stringi_1.8.4       xfun_0.43           timechange_0.3.0   
[22] cli_3.6.2           withr_3.0.0         magrittr_2.0.3     
[25] digest_0.6.35       grid_4.3.3          rstudioapi_0.16.0  
[28] hms_1.1.3           lifecycle_1.0.4     vctrs_0.6.5        
[31] evaluate_0.23       glue_1.7.0          fansi_1.0.6        
[34] colorspace_2.1-0    rmarkdown_2.26      tools_4.3.3        
[37] pkgconfig_2.0.3     htmltools_0.5.8.1