7 数据预处理

在对上述数据集进行数据预处理时，主要进行了两个关键步骤的清洗工作：

临床表型数据清洗：对临床表型数据进行了全面的检查和清理。首先，检查了数据的完整性，确保没有缺失值或异常值对后续分析造成影响。其次，核对了数据的准确性。最后，对临床表型数据进行了必要的标准化处理，以确保不同数据集之间的数据可以相互比较和整合。
表达谱数据清洗：在表达谱数据的清洗过程中，同样注重数据的完整性和准确性。首先，检查了基因表达数据是否存在缺失值或异常值。其次，对表达谱数据进行了质量控制分析。

7.1 LIRI-JP

下载到的数据有如下：

病人的数据（临床数据）：donor.LIRI-JP.tsv.gz
表达数据（测序数据）：exp_seq.LIRI-JP.tsv.gz
样品信息：sample.LIRI-JP.tsv.gz
突变数据：simple_somatic_mutation.open.LIRI-JP.tsv.gz
实验处理数据：specimen.LIRI-JP.tsv.gz
结构变异数据：structural_somatic_mutation.LIRI-JP.tsv.gz

7.1.1 LIRI-JP临床表型

本节将生成临床患者临床数据以及样本对应的表型数据。

7.1.1.1 加载R包

使用rm(list = ls())清空环境内变量

library(tidyverse)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

7.1.1.2 导入数据

LIRIJP_metadata <- read.csv("./data/LIRI-JP/donor.LIRI-JP.tsv", sep = "\t")
LIRIJP_sample <- read.csv("./data/LIRI-JP/sample.LIRI-JP.tsv", sep = "\t")
LIRIJP_specimen <- read.csv("./data/LIRI-JP/specimen.LIRI-JP.tsv", sep = "\t")

7.1.1.3 清洗临床数据

清洗临床数据

LIRIJP_clin_columns <- c("icgc_donor_id", "project_code", "submitted_donor_id", 
             "donor_sex", "donor_vital_status", "donor_age_at_diagnosis",
             "donor_tumour_stage_at_diagnosis", "donor_survival_time")

LIRIJP_clinic_data <- LIRIJP_metadata

head(LIRIJP_clinic_data)

7.1.1.4 清洗实验处理数据

LIRIJP_spe_columns <- c("icgc_specimen_id", "project_code", "submitted_specimen_id", 
             "icgc_donor_id", "submitted_donor_id", "specimen_type")

LIRIJP_specimen_data <- LIRIJP_specimen 

head(LIRIJP_specimen_data)

7.1.1.5 清洗样品信息数据

数据清洗: 样品信息

LIRIJP_sam_columns <- c("icgc_sample_id", "project_code", "submitted_sample_id", 
             "icgc_specimen_id", "submitted_specimen_id", "icgc_donor_id",
             "submitted_donor_id")

LIRIJP_sample_data <- LIRIJP_sample

head(LIRIJP_sample_data)

7.1.1.6 输出结果

输出结果: 将上述结果以csv格式输出到结果目录

if (!dir.exists("./data/result/metadata/")) {
  dir.create("./data/result/metadata/", recursive = TRUE)
}

write.csv(LIRIJP_clinic_data, "./data/result/metadata/LIRI-JP-clinical.csv", row.names = F)
write.csv(LIRIJP_sample_data, "./data/result/metadata/LIRI-JP-sample.csv", row.names = F)
write.csv(LIRIJP_specimen_data, "./data/result/metadata/LIRI-JP-specimen.csv", row.names = F)

7.1.2 LIRI-JP转录组

基因表达数据需要进行基因ID转换以及冗余基因的过滤，最后生成符合要求的输出结果。

7.1.2.1 加载R包

library(tidyverse)
library(data.table)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

7.1.2.2 导入数据

LIRIJP_genexp <- fread("./data/LIRI-JP/exp_seq.LIRI-JP.tsv")
LIRIJP_geneID <- fread("./data/GeneIDMap/human_gene_all.tsv")

head(LIRIJP_geneID)

7.1.2.3 数据清洗

数据清洗

LIRIJP_gene_data <- LIRIJP_genexp %>%
  dplyr::select(icgc_specimen_id, gene_id,
                normalized_read_count, raw_read_count) %>%
  dplyr::mutate(gene_id = gsub("\\.\\d+", "", gene_id))

head(LIRIJP_relative_df_final[, 1:6])

7.1.2.4 过滤基因


get_trimGene_LIRIJP <- function(input, occurrence = 0.1) {
  
  input_df <- data.frame(input)
  
  return(prof_res_v2)
}

LIRIJP_counts_df_trim <- get_trimGene_LIRIJP(input = LIRIJP_counts_df_final)
LIRIJP_relative_df_trim <- get_trimGene_LIRIJP(input = LIRIJP_relative_df_final)

dim(LIRIJP_counts_df_trim)

7.1.2.5 输出结果

输出结果: 将上述结果以tsv格式输出到结果目录

if (!dir.exists("./data/result/profile/")) {
  dir.create("./data/result/profile/", recursive = TRUE)
}

write.table(LIRIJP_counts_df_trim, "./data/result/profile/LIRI-JP_gene_counts.tsv", 
            sep = "\t", quote = F, row.names = T)
write.table(LIRIJP_relative_df_trim, "./data/result/profile/LIRI-JP_gene_relative.tsv", 
            sep = "\t", quote = F, row.names = T)

7.2 TCGA-LIHC

options(warn = -1)
library(TCGAbiolinks)
library(dplyr)
library(SummarizedExperiment)
library(optparse)
options(warn = 0)


option_list <- list(
  make_option(c("-t", "--tumor"), type="character", default="TCGA-KIRC", 
              help="tumor type from TCGA", metavar="character"),
  make_option(c("-o", "--out"), type="character",  
              help="output directory", metavar="character")
); 
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

current_dir <- getwd()

# input parameters
tumor_type <- opt$tumor
data_type <- opt$out


# clinical information output
if(!dir.exists(paste0(current_dir, "/Clinical/"))){
  dir.create(paste0(current_dir, "/Clinical/"))
}
origin_clinical_name <- paste0(current_dir, "/Clinical/", tumor_type, "_clinical_origin.csv")
if(!file.exists(origin_clinical_name)){
  clinical <- GDCquery_clinic(project = tumor_type, type = "clinical")
  write.csv(clinical, origin_clinical_name, row.names = F)
}
message("Clinical information has been successfully downloaded")


####################################################  
#   omics-data layer: SummarizedExperiment Object  #
#################################################### 
OmicsData <- get_OmicsData(project = tumor_type, Outdir = data_type)
if(!dir.exists(paste0(current_dir, "/Omics/"))){
  dir.create(paste0(current_dir, "/Omics/"))
}
OmicsData_filename <- paste0(current_dir, "/Omics/", tumor_type, "_", data_type, ".RDS")                         
saveRDS(OmicsData, file = OmicsData_filename)
message("Omics data also has been successfully downloaded")

最终获得了临床表型和表达谱数据：

病人的数据（临床数据）：TCGA-LIHC_clinical_origin.csv
表达数据（测序数据）：TCGA-LIHC.htseq_counts.tsv/TCGA-LIHC.htseq_fpkm.tsv

7.2.1 TCGA-LIHC临床表型

本节将生成临床患者临床数据以及样本对应的表型数据。

7.2.1.1 加载R包

使用rm(list = ls())清空环境内变量

library(tidyverse)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

7.2.1.2 导入数据

TCGALIHC_metadata <- read.csv("./data/TCGA_LIHC/TCGA-LIHC_clinical_origin.csv")

7.2.1.3 数据清洗

重命名列名


TCGALIHC_clin_columns <- c("submitter_id", "race", 
             "gender", "vital_status", "age_at_index",
             "ajcc_pathologic_stage", "days_to_last_follow_up",

head(TCGALIHC_clinic_data)

7.2.1.4 输出结果

将上述结果以csv格式输出到结果目录

if (!dir.exists("./data/result/metadata/")) {
  dir.create("./data/result/metadata/", recursive = TRUE)
}

write.csv(TCGALIHC_clinic_data, "./data/result/metadata/TCGA-LIHC-clinical.csv", row.names = F)

7.2.2 TCGA-LIHC转录组

基因表达数据需要转换基因ID以及过滤冗余基因

7.2.2.1 加载R包

library(tidyverse)
library(data.table)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

7.2.2.2 导入数据


TCGALIHC_genexp <- fread("./data/TCGA_LIHC/TCGA-LIHC.htseq_counts.tsv")
TCGALIHC_genexp_fpkm <- fread("./data/TCGA_LIHC/TCGA-LIHC.htseq_fpkm.tsv")
TCGALIHC_geneID <- fread("./data/GeneIDMap/human_gene_all.tsv")

7.2.2.3 数据清洗

colnames(TCGALIHC_genexp) <- gsub("-", "_", colnames(TCGALIHC_genexp))
colnames(TCGALIHC_genexp) <- gsub("_01A", "", colnames(TCGALIHC_genexp))
TCGALIHC_gene_data <- TCGALIHC_genexp %>%
  dplyr::select(-all_of(grep("11A", colnames(TCGALIHC_genexp), value = T))) %>%
  dplyr::rename(gene_id = Ensembl_ID) %>%
  dplyr::mutate(gene_id = gsub("\\.\\d+", "", gene_id))

TCGALIHC_fpkm_df_final <- TCGALIHC_gene_fpkm_data %>%
  dplyr::left_join(TCGALIHC_geneID %>%
                      dplyr::select(ensembl_gene_id, external_gene_name) %>%
                      dplyr::distinct(),
                    by = c("gene_id" = "ensembl_gene_id")) %>%
  dplyr::select(-gene_id) %>%
  dplyr::rename(GeneName = external_gene_name) %>%  
  dplyr::select(GeneName, everything()) %>%
  dplyr::filter(!is.na(GeneName)) %>%
  dplyr::distinct()
head(TCGALIHC_fpkm_df_final[, 1:6])

7.2.2.4 过滤基因

TCGALIHC_counts_df_trim <- get_trimGene_TCGALIHC(input = TCGALIHC_counts_df_final)
TCGALIHC_fpkm_df_trim <- get_trimGene_TCGALIHC(input = TCGALIHC_fpkm_df_final)

dim(TCGALIHC_fpkm_df_trim)

7.2.2.5 表达值转换成count abundance

#| warning: false
#| message: false

TCGALIHC_counts_df_trim_final <- apply(TCGALIHC_counts_df_trim, 2, function(x) {
  return(2^x - 1)
})

head(TCGALIHC_counts_df_trim_final[, 1:5])

7.2.2.6 输出结

输出结果: 将上述结果以tsv格式输出到结果目录

if (!dir.exists("./data/result/profile/")) {
  dir.create("./data/result/profile/", recursive = TRUE)
}

write.table(TCGALIHC_counts_df_trim, "./data/result/profile/TCGA-LIHC_gene_counts.tsv", 
            sep = "\t", quote = F, row.names = T)
write.table(TCGALIHC_counts_df_trim_final, "./data/result/profile/TCGA-LIHC_gene_Truecounts.tsv", 
            sep = "\t", quote = F, row.names = T)
write.table(TCGALIHC_fpkm_df_trim, "./data/result/profile/TCGA-LIHC_gene_fpkm.tsv", 
            sep = "\t", quote = F, row.names = T)

7.3 GSE14520

最终获得了临床表型和表达谱数据：

病人的数据（临床数据）：GSE14520_clinical_post.csv/GSE14520_Extra_Supplement.txt
表达数据（测序数据）：GSE14520_profile_post.tsv

7.3.1 GSE14520临床表型

本节将生成临床患者临床数据以及样本对应的表型数据。

7.3.1.1 加载R包

加载R包：使用rm(list = ls())清空环境内变量

library(tidyverse)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#1F78B4", "#EE2B2B")

7.3.1.2 导入数据

GSE14520_metadata <- read.csv("./data/GSE14520/GSE14520_process/GSE14520_clinical_post.csv")
GSE14520_sup_info <- read.table("./data/GSE14520/GSE14520_Extra_Supplement.txt", sep = "\t", header = T)

7.3.1.3 数据清洗

数据清洗

GSE14520_clin_columns <- c("Barcode", "Disease.state", "Individual", 
      "Tissue", "Gender", "Age", "TNM.staging", 
      "Survival.status","Survival.months")

head(GSE14520_clinic_data)

7.3.1.4 输出结果

输出结果: 将上述结果以csv格式输出到结果目录

if (!dir.exists("./data/result/metadata/")) {
  dir.create("./data/result/metadata/", recursive = TRUE)
}

write.csv(GSE14520_clinic_data, "./data/result/metadata/GSE14520-clinical.csv", row.names = F)

7.3.2 GSE14520转录组

基因表达数据需要转换基因ID以及过滤冗余基因

7.3.2.1 加载R包

加载R包

library(tidyverse)
library(data.table)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

7.3.2.2 导入数据

GSE14520_genexp <- fread("./data/GSE14520/GSE14520_process/GSE14520_profile_post.tsv")

7.3.2.3 数据清洗

数据清洗

GSE14520_counts_df_final <- GSE14520_genexp %>%
  dplyr::rename(GeneName = GeneID) %>%  
  dplyr::select(GeneName, everything()) %>%
  dplyr::filter(!is.na(GeneName)) %>%
  dplyr::distinct()

head(GSE14520_counts_df_final[, 1:6])

7.3.2.4 过滤基因

GSE14520_counts_df_trim <- get_trimGene_GSE14520(input = GSE14520_counts_df_final)

dim(GSE14520_counts_df_trim)

7.3.2.5 输出结果

输出结果: 将上述结果以tsv格式输出到结果目录


if (!dir.exists("./data/result/profile/")) {
  dir.create("./data/result/profile/", recursive = TRUE)
}

write.table(GSE14520_counts_df_trim, "./data/result/profile/GSE14520_gene_counts.tsv", 
            sep = "\t", quote = F, row.names = T)

7.4 总结

针对LIRI-JP，LIHC-US/TCGA-LIHC和GSE14520这三个数据集的临床表型和表达谱数据，进行了详尽的数据预处理工作。经过清洗、整合和筛选，成功去除了冗余信息、修正了潜在错误，并确保了数据的完整性和准确性。最终，得到了符合后续分析要求的高质量输入文件。

为了更高效地利用这些数据，决定将它们存储为常见的ExpressionSet数据对象（见章节 8）。ExpressionSet是生物信息学分析中常用的数据结构，它能够同时容纳表达谱数据和与之相关的临床表型信息。通过将数据转换为ExpressionSet格式，可以方便地访问和操作数据，为后续的统计分析、可视化以及模型构建等步骤提供便利。这一步骤的完成，为后续的数据分析奠定了坚实的基础。

系统信息

sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] data.table_1.15.4 lubridate_1.9.3   forcats_1.0.0     stringr_1.5.1    
 [5] dplyr_1.1.4       purrr_1.0.2       readr_2.1.5       tidyr_1.3.1      
 [9] tibble_3.2.1      ggplot2_3.5.1     tidyverse_2.0.0  

loaded via a namespace (and not attached):
 [1] gtable_0.3.5        jsonlite_1.8.8      compiler_4.3.3     
 [4] BiocManager_1.30.23 renv_1.0.0          tidyselect_1.2.1   
 [7] scales_1.3.0        yaml_2.3.8          fastmap_1.1.1      
[10] R6_2.5.1            generics_0.1.3      knitr_1.46         
[13] htmlwidgets_1.6.4   munsell_0.5.1       tzdb_0.4.0         
[16] pillar_1.9.0        rlang_1.1.3         utf8_1.2.4         
[19] stringi_1.8.4       xfun_0.43           timechange_0.3.0   
[22] cli_3.6.2           withr_3.0.0         magrittr_2.0.3     
[25] digest_0.6.35       grid_4.3.3          rstudioapi_0.16.0  
[28] hms_1.1.3           lifecycle_1.0.4     vctrs_0.6.5        
[31] evaluate_0.23       glue_1.7.0          fansi_1.0.6        
[34] colorspace_2.1-0    rmarkdown_2.26      tools_4.3.3        
[37] pkgconfig_2.0.3     htmltools_0.5.8.1