8 数据对象
ExpressionSet是Biobase(Huber 等 2015)包提供的一种数据对象,专为存储生物学实验数据而设计。它允许用户将表型数据、表达谱以及特征数据(如基因或蛋白质标识符)集成在一个统一的数据结构中。在本章节中,将利用这种数据对象来组织并处理获得的三个数据集:LIRI-JP
,LIHC-US/TCGA-LIHC
和GSE14520
。
8.1 LIRI-JP
要生成LIRI-JP
的ExpressionSet对象,首先需要加载必要的R包,然后导入数据和元数据,最后将这些数据组合成一个ExpressionSet对象。
8.1.1 加载R包
加载R包:使用rm(list = ls())
清空环境内变量
8.1.2 导入数据
输入数据来自于 章节 7
8.1.3 生成ExpressionSet
生成ExpressionSet-class数据对象:过滤掉低出现率的基因,默认出现率阈值是20%
get_ExprSet_LIRIJP <- function(
profile,
metadata,
occurrence = 0.2) {
profile_new <- profile %>%
tibble::column_to_rownames("V1")
metadata_new <- metadata %>%
dplyr::filter(Tumour_Stage != "") %>%
dplyr::mutate(Group = case_when(
Tumour_Stage %in% c("T1", "T2") ~ "Early Stage",
Tumour_Stage %in% c("T3", "T4") ~ "Late Stage"))
# metadata Description
sid <- dplyr::intersect(colnames(profile_new), metadata_new$SpecimenID)
phen <- metadata_new[pmatch(sid, metadata_new$SpecimenID), , ]
phen <- phen[pmatch(unique(phen$PatientID), phen$PatientID), , ]
prof <- profile_new %>%
dplyr::select(dplyr::all_of(phen$SpecimenID))
if (all(colnames(prof) == phen$SpecimenID)) {
colnames(prof) <- phen$PatientID
}
# filter features according gene symbol
prof_cln <- prof %>%
tibble::rownames_to_column("Gene") %>%
data.frame()
idx <- grep("Gene", colnames(prof_cln))
prof_cln$median <- apply(prof_cln[, -idx], 1, median)
prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
dplyr::select(-median)
# return result: gene symbol
rownames(prf_deduplicated) <- NULL
prof_res <- prf_deduplicated %>%
dplyr::select(Gene, everything()) %>%
tibble::column_to_rownames("Gene")
# determine the right order between profile and phenotype
for (i in 1:ncol(prof_trim)) {
if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
stop(paste0(i, " Wrong"))
}
}
expressionSet <- new("ExpressionSet",
exprs = exprs,
phenoData = adf,
experimentData = experimentData)
return(expressionSet)
}
LIRIJP_ExprSet_count <- get_ExprSet_LIRIJP(
profile = LIRIJP_count_data,
metadata = LIRIJP_phenotype,
occurrence = 0.2)
LIRIJP_ExprSet_rela <- get_ExprSet_LIRIJP(
profile = LIRIJP_relative_data,
metadata = LIRIJP_phenotype,
occurrence = 0.2)
LIRIJP_ExprSet_rela
8.1.4 输出结果
输出结果: 将上述结果以RDS格式输出到结果目录
if (!dir.exists("./data/result/ExpSetObject/")) {
dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}
saveRDS(LIRIJP_ExprSet_count, "./data/result/ExpSetObject/LIRI-JP_ExpSet_counts.RDS", compress = TRUE)
saveRDS(LIRIJP_ExprSet_rela, "./data/result/ExpSetObject/LIRI-JP_ExpSet_relative.RDS", compress = TRUE)
8.2 TCGA-LIHC
要生成TCGA-LIHC
的ExpressionSet对象,首先需要加载必要的R包,然后导入数据和元数据,最后将这些数据组合成一个ExpressionSet对象。
8.2.1 加载R包
加载R包:使用rm(list = ls())
清空环境内变量
8.2.2 导入数据
TCGALIHC_count_data <- fread("./data/result/profile/TCGA-LIHC_gene_counts.tsv")
TCGALIHC_TrueCount_data <- fread("./data/result/profile/TCGA-LIHC_gene_Truecounts.tsv")
TCGALIHC_fpkm_data <- fread("./data/result/profile/TCGA-LIHC_gene_fpkm.tsv")
TCGALIHC_phenotype <- read.csv("./data/result/metadata/TCGA-LIHC-clinical.csv")
8.2.3 生成ExpressionSet
生成ExpressionSet-class数据对象:过滤掉低出现率的基因,默认出现率阈值是20%
get_ExprSet_TCGALIHC <- function(
profile,
metadata,
occurrence = 0.2) {
profile_new <- profile %>%
tibble::column_to_rownames("V1")
metadata_new <- metadata %>%
dplyr::filter(Tumour_Stage != "") %>%
dplyr::mutate(Group = case_when(
Tumour_Stage %in% c("T1", "T2") ~ "Early Stage",
Tumour_Stage %in% c("T3", "T4") ~ "Late Stage")) %>%
dplyr::mutate(Time = ifelse(is.na(death_Time), follow_up_Time, death_Time))
# metadata Description
sid <- dplyr::intersect(colnames(profile_new), metadata_new$PatientID)
phen <- metadata_new[pmatch(sid, metadata_new$PatientID), , ]
prof <- profile_new %>%
dplyr::select(dplyr::all_of(phen$PatientID))
# filter features according gene symbol
prof_cln <- prof %>%
tibble::rownames_to_column("Gene") %>%
data.frame()
idx <- grep("Gene", colnames(prof_cln))
prof_cln$median <- apply(prof_cln[, -idx], 1, median)
prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
dplyr::select(-median)
# return result: gene symbol
rownames(prf_deduplicated) <- NULL
prof_res <- prf_deduplicated %>%
dplyr::select(Gene, everything()) %>%
tibble::column_to_rownames("Gene")
# determine the right order between profile and phenotype
for (i in 1:ncol(prof_trim)) {
if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
stop(paste0(i, " Wrong"))
}
}
expressionSet <- new("ExpressionSet",
exprs = exprs,
phenoData = adf,
experimentData = experimentData)
return(expressionSet)
}
TCGALIHC_ExprSet_count <- get_ExprSet_TCGALIHC(
profile = TCGALIHC_count_data,
metadata = TCGALIHC_phenotype,
occurrence = 0.2)
# TCGALIHC_ExprSet_count
TCGALIHC_ExprSet_TrueCount <- get_ExprSet_TCGALIHC(
profile = TCGALIHC_TrueCount_data,
metadata = TCGALIHC_phenotype,
occurrence = 0.2)
# TCGALIHC_ExprSet_TrueCount
TCGALIHC_ExprSet_fpkm <- get_ExprSet_TCGALIHC(
profile = TCGALIHC_fpkm_data,
metadata = TCGALIHC_phenotype,
occurrence = 0.2)
TCGALIHC_ExprSet_fpkm
8.2.4 输出结果
输出结果: 将上述结果以RDS格式输出到结果目录
if (!dir.exists("./data/result/ExpSetObject/")) {
dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}
saveRDS(TCGALIHC_ExprSet_count, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_counts.RDS", compress = TRUE)
saveRDS(TCGALIHC_ExprSet_TrueCount, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_TrueCount.RDS", compress = TRUE)
saveRDS(TCGALIHC_ExprSet_fpkm, "./data/result/ExpSetObject/TCGA-LIHC_ExpSet_fpkm.RDS", compress = TRUE)
8.3 GSE14520
要生成GSE14520
的ExpressionSet对象,首先需要加载必要的R包,然后导入数据和元数据,最后将这些数据组合成一个ExpressionSet对象。
8.3.1 加载R包
加载R包:使用rm(list = ls())
清空环境内变量
8.3.2 导入数据
导入数据
8.3.3 生成ExpressionSet
生成ExpressionSet-class数据对象:过滤掉低出现率的基因,默认出现率阈值是20%
get_ExprSet_GSE14520 <- function(
profile,
metadata,
occurrence = 0.2) {
profile_new <- profile %>%
tibble::column_to_rownames("V1")
metadata_new <- metadata %>%
dplyr::filter(Tumour_Stage != "") %>%
dplyr::mutate(Group = case_when(
Tumour_Stage %in% c("T1", "T2") ~ "Early Stage",
Tumour_Stage %in% c("T3", "T4") ~ "Late Stage"))
# metadata Description
sid <- dplyr::intersect(colnames(profile_new), metadata_new$SampleID)
phen <- metadata_new[pmatch(sid, metadata_new$SampleID), , ]
prof <- profile_new %>%
dplyr::select(dplyr::all_of(phen$SampleID))
# return result: gene symbol
rownames(prf_deduplicated) <- NULL
prof_res <- prf_deduplicated %>%
dplyr::select(Gene, everything()) %>%
tibble::column_to_rownames("Gene")
# determine the right order between profile and phenotype
for (i in 1:ncol(prof_trim)) {
if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
stop(paste0(i, " Wrong"))
}
}
expressionSet <- new("ExpressionSet",
exprs = exprs,
phenoData = adf,
experimentData = experimentData)
return(expressionSet)
}
GSE14520_ExprSet_count <- get_ExprSet_GSE14520(
profile = GSE14520_count_data,
metadata = GSE14520_phenotype,
occurrence = 0.2)
GSE14520_ExprSet_count
8.3.4 输出结果
输出结果: 将上述结果以RDS格式输出到结果目录
8.4 重叠基因
为了确保在发现数据集中识别的标记基因能够在验证数据集中进行准确的验证,首先加载了基于count abundance的RDS,这些RDS分别包含LIRI-JP
,LIHC-US/TCGA-LIHC
和GSE14520
数据集的基因表达信息。
步骤:
加载RDS文件:首先,将分别加载
LIRI-JP
,LIHC-US/TCGA-LIHC
和GSE14520
的count abundance的RDS文件。提取基因标识符:从每个数据集中提取基因标识符列表。
获取基因交集:比较这三个基因标识符列表,找出它们之间的交集。这个交集将包含所有在三个数据集中都存在的基因。
更新数据集:使用交集中的基因标识符来过滤和更新每个数据集,以确保后续分析中的基因一致性。
8.4.1 加载R包
加载R包
8.4.2 导入数据
导入数据
8.4.3 共有基因
共有基因
8.4.4 生成数据对象
生成数据对象
get_ExprSet <- function(
dat,
gene_list = overlap_genes,
occurrence = 0.2) {
profile <- exprs(dat) %>%
as.data.frame() %>%
t() %>%
as.data.frame() %>%
dplyr::select(all_of(overlap_genes)) %>%
t() %>%
as.data.frame()
metadata <- pData(dat) %>%
as.data.frame()
# metadata Description
sid <- dplyr::intersect(colnames(profile), rownames(metadata))
phen <- metadata[pmatch(sid, rownames(metadata)), , ]
prof <- profile %>%
dplyr::select(dplyr::all_of(rownames(phen)))
# filter features according gene symbol
prof_cln <- prof %>%
tibble::rownames_to_column("Gene") %>%
data.frame()
idx <- grep("Gene", colnames(prof_cln))
prof_cln$median <- apply(prof_cln[, -idx], 1, median)
prof_cln <- with(prof_cln, prof_cln[order(Gene, median, decreasing = T), ])
prf_deduplicated <- prof_cln[!duplicated(prof_cln$Gene), ] %>%
dplyr::select(-median)
# determine the right order between profile and phenotype
for (i in 1:ncol(prof_trim)) {
if (!(colnames(prof_trim)[i] == rownames(phen)[i])) {
stop(paste0(i, " Wrong"))
}
}
expressionSet <- new("ExpressionSet",
exprs = exprs,
phenoData = adf,
experimentData = experimentData)
return(expressionSet)
}
ExprSet_LIRI_JP_new <- get_ExprSet(
dat = ExprSet_LIRI_JP,
gene_list = overlap_genes,
occurrence = 0.2)
ExprSet_LIRI_JP_new
ExprSet_TCGA_LIHC_new <- get_ExprSet(
dat = ExprSet_TCGA_LIHC,
gene_list = overlap_genes,
occurrence = 0.2)
ExprSet_TCGA_LIHC_new
ExprSet_GSE14520_new <- get_ExprSet(
dat = ExprSet_GSE14520,
gene_list = overlap_genes,
occurrence = 0.2)
ExprSet_GSE14520_new
8.4.5 输出结果
输出结果: 将上述结果以RDS格式输出到结果目录
if (!dir.exists("./data/result/ExpSetObject/")) {
dir.create("./data/result/ExpSetObject/", recursive = TRUE)
}
saveRDS(ExprSet_LIRI_JP_new, "./data/result/ExpSetObject/Final_ExpSet_LIRI_JP_TrueCounts.RDS", compress = TRUE)
saveRDS(ExprSet_TCGA_LIHC_new, "./data/result/ExpSetObject/Final_ExpSet_TCGA_LIHC_TrueCounts.RDS", compress = TRUE)
saveRDS(ExprSet_GSE14520_new, "./data/result/ExpSetObject/Final_ExpSet_GSE14520_TrueCounts.RDS", compress = TRUE)
8.5 总结
在完成了对三个肝细胞癌(HCC)群体的转录组和临床数据的预处理后,分别将这些数据转换为了ExpressionSet数据对象,并保存为RDS
文件格式,以便于后续的数据管理和分析。为了确保后续分析的一致性和可靠性,进一步从这些RDS文件中提取了三个数据集中共有的基因。这些共有的基因是后续分析的基础,因为它们代表了三个群体中均存在并可能参与HCC发展的生物学过程。
系统信息
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices datasets utils methods base
other attached packages:
[1] Biobase_2.62.0 BiocGenerics_0.48.1 data.table_1.15.4
[4] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[7] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
[10] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
[13] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] gtable_0.3.5 jsonlite_1.8.8 compiler_4.3.3
[4] BiocManager_1.30.23 renv_1.0.0 tidyselect_1.2.1
[7] scales_1.3.0 yaml_2.3.8 fastmap_1.1.1
[10] R6_2.5.1 generics_0.1.3 knitr_1.46
[13] htmlwidgets_1.6.4 munsell_0.5.1 tzdb_0.4.0
[16] pillar_1.9.0 rlang_1.1.3 utf8_1.2.4
[19] stringi_1.8.4 xfun_0.43 timechange_0.3.0
[22] cli_3.6.2 withr_3.0.0 magrittr_2.0.3
[25] digest_0.6.35 grid_4.3.3 rstudioapi_0.16.0
[28] hms_1.1.3 lifecycle_1.0.4 vctrs_0.6.5
[31] evaluate_0.23 glue_1.7.0 fansi_1.0.6
[34] colorspace_2.1-0 rmarkdown_2.26 tools_4.3.3
[37] pkgconfig_2.0.3 htmltools_0.5.8.1