# Setup: load required packages, clear the workspace, and define the group
# names, colors, and shapes used throughout this chapter
library(tidyverse)
library(data.table)
library(Biobase)
library(randomForest)
library(caret)
library(mlbench)
library(Boruta)
library(pROC)
library(Hmisc)
library(MLmetrics)
rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)
grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")
grp_shapes <- c(15, 16)
20 Boruta RF
Boruta combined with RF (Random Forest) was used to perform feature selection on the differentially expressed genes (see Chapter 11).
20.1 Loading R Packages
rm(list = ls()) is used to clear all variables from the environment.
20.2 Importing Data
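The import code is not shown in this section. The objects used below (da_res, the differential analysis results from Chapter 11; ExprSet, an ExpressionSet holding the expression matrix and clinical phenotypes; and metadata, the clinical phenotype table) are assumed to be loaded from upstream results. The file paths in this minimal sketch are hypothetical:
# Hypothetical locations; the actual files are produced in earlier chapters
da_res <- fread("./data/result/DA/HCC_DA_results.csv")
load("./data/result/HCC_ExprSet.RData") # provides ExprSet
metadata <- Biobase::pData(ExprSet) %>%
  tibble::rownames_to_column("SampleID")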
20.3 Preparing Data
- Differentially expressed genes: see Table 11.1 for the parsed results
# Keep significant DEGs and drop feature IDs that contain a dot
signif_DEG <- da_res %>%
  dplyr::filter(Enrichment != "Nonsignif") %>%
  dplyr::slice(-grep("\\.", FeatureID))
head(signif_DEG[, 1:6])
- Gene expression profile: row names are samples, column names are gene IDs
profile <- exprs(ExprSet) %>%
  as.data.frame()
rownames(profile) <- make.names(rownames(profile))
# The expression matrix has genes as rows, so transpose it and keep only the
# significant DEGs; this step is implied by the downstream code (an assumption)
profile_DEG <- profile %>%
  t() %>%
  as.data.frame() %>%
  dplyr::select(any_of(make.names(signif_DEG$FeatureID)))
head(profile_DEG[, 1:6])
- Removing collinear features
In machine learning, collinear features are removed during data preprocessing mainly to improve model performance, stability, and accuracy: collinear features carry redundant information, can destabilize importance estimates, and increase the risk of overfitting.
Here, collinearity was removed with a correlation analysis using Hmisc::rcorr (Harrell Jr and Harrell Jr 2019).
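The object feature_remain used below records the features kept by this filter. Its construction is not shown in this section; the following is a minimal sketch, assuming a Pearson correlation cutoff of 0.8 and using caret::findCorrelation to pick the features to drop:
# Pairwise correlations among the DEG expression columns
corr_mat <- Hmisc::rcorr(as.matrix(profile_DEG))$r
# Indices of features exceeding the (assumed) 0.8 pairwise-correlation cutoff
drop_idx <- caret::findCorrelation(corr_mat, cutoff = 0.8)
keep_ids <- setdiff(colnames(corr_mat), colnames(corr_mat)[drop_idx])
# Row names are the retained FeatureIDs, matching rownames(feature_remain) below
feature_remain <- data.frame(FeatureID = keep_ids, row.names = keep_ids)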
# Reuse the cached collinearity-filtered profile if it exists; otherwise build and save it
if (file.exists("./data/result/ML/RF/HCC_RF_RM_profile.tsv")) {
  profile_remain <- fread("./data/result/ML/RF/HCC_RF_RM_profile.tsv", sep = "\t") %>%
    tibble::column_to_rownames("V1")
} else {
  profile_remain <- profile_DEG %>%
    dplyr::select(all_of(rownames(feature_remain)))
  if (!dir.exists("./data/result/ML/RF")) {
    dir.create("./data/result/ML/RF", recursive = TRUE)
  }
  write.table(profile_remain, "./data/result/ML/RF/HCC_RF_RM_profile.tsv",
              row.names = T, sep = "\t", quote = F)
}
print(dim(profile_remain))
- Clinical phenotype table: contains grouping and other sample information
- Merging data
MergeData <- metadata %>%
dplyr::select(SampleID, Group) %>%
dplyr::inner_join(profile_remain %>%
tibble::rownames_to_column("SampleID"),
by = "SampleID") %>%
tibble::column_to_rownames("SampleID") %>%
dplyr::mutate(Group = recode(Group, "Early Stage" = "Early",
"Late Stage" = "Late")) %>%
dplyr::mutate(Group = factor(Group))
head(MergeData[, 1:6])
20.4 Machine Learning Feature Selection
Feature selection workflow:
- Data splitting:
The original dataset is divided into training, validation, and test sets. Typically the training set is used to fit the model, the validation set to tune hyperparameters and select the best model, and the test set to evaluate the final model. The split ratio can be adjusted to the size and nature of the dataset; common choices are 60%-80% for training and 10%-20% each for validation and testing.
- Data transformation:
Data cleaning: handle missing values, outliers, duplicates, and so on. Missing values can be imputed with the mean, median, or mode, or filled by interpolation or machine-learning prediction.
Feature encoding: categorical variables must be encoded so that models can process them; common schemes include one-hot encoding and label encoding.
Feature discretization: convert continuous features into discrete categories, for example by binning; this can help with features that relate to the target nonlinearly.
Feature standardization or normalization: features with different scales or distributions should be standardized or normalized so the model can handle them properly.
- Feature selection:
Correlation analysis: compute the correlation between each feature and the target variable, e.g. the Pearson or Spearman coefficient, to obtain an initial screen of features strongly related to the target.
Tree-model importance: use tree models such as decision trees or random forests to score feature importance from each feature's contribution to splits.
Heuristic search strategies: e.g. forward sequential selection, which starts from an empty candidate set and iteratively adds the feature most strongly related to the target.
Global search strategies: e.g. exhaustive search or branch and bound, which choose the best-performing subset among all possible feature combinations.
- Model building: train a machine learning model on the feature-selected training set, choosing a suitable algorithm such as logistic regression, support vector machines, decision trees, random forests, or neural networks.
- Tuning: optimize hyperparameters on the validation set. Hyperparameters are settings chosen by the user, such as the learning rate, number of iterations, and regularization strength; grid search or random search are common strategies.
- Test set evaluation: evaluate the tuned model on the test set with metrics such as accuracy, precision, recall, F1 score, and AUC-ROC, and keep the best-performing model as the final model.
20.4.1 Data Splitting
In machine learning practice, data splitting is a crucial step that divides the original dataset into a training set and a test set (here using caret::createDataPartition (Kuhn 2008)).
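The splitting code itself is not shown in this section; the following is a minimal sketch, assuming a stratified 70/30 split on Group with the seed used elsewhere in this chapter:
set.seed(123)
# Stratified partition: 70% of samples per group go to the training set (assumed ratio)
train_idx <- caret::createDataPartition(MergeData$Group, p = 0.7, list = FALSE)
trainData <- MergeData[train_idx, ]
testData <- MergeData[-train_idx, ]
# Predictor tables and labels, matching the objects saved later in HCC_RF_Boruta.RData
X_train <- trainData[, -1]
y_train <- trainData[, 1]
X_test <- testData[, -1]
y_test <- testData[, 1]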
20.4.2 Base Model
The randomForest function (Liaw, Wiener, et al. 2002) is used to build a baseline random forest model and obtain the feature importance score matrix.
- Feature importance scores
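The baseline fit and the importance table imp_biomarker used downstream are not shown in this section; the following is a minimal sketch, assuming ntree = 1000 and sorting by MeanDecreaseAccuracy so that the dplyr::slice(1:optimal) calls below pick the top-ranked features:
set.seed(123)
# Baseline random forest on all collinearity-filtered features (ntree is assumed)
rf_fit <- randomForest(Group ~ ., data = trainData, ntree = 1000, importance = TRUE)
# Importance score matrix, ranked by mean decrease in accuracy
imp_biomarker <- randomForest::importance(rf_fit) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("FeatureID") %>%
  dplyr::arrange(dplyr::desc(MeanDecreaseAccuracy))
head(imp_biomarker)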
20.4.3 Boruta Feature Selection
Boruta feature selection (via the Boruta R package (Kursa and Rudnicki 2010)) is a random-forest-based feature selection method that aims to find the truly important features in a given feature set and separate them from irrelevant ones. Its core idea is to create randomly generated "shadow features" that mimic randomly chosen features, and to compare these shadow features against the original features to decide how important each original feature is.
if (!file.exists("./data/result/ML/RF/HCC_RF_Boruta.RData")) {
  set.seed(123)
  # Run Boruta on the training data; maxRuns = 500 is an assumed value, as the
  # original call is not shown in this section
  boruta.bank <- Boruta::Boruta(Group ~ ., data = trainData, maxRuns = 500)
  fs_Boruta <- Boruta::getSelectedAttributes(boruta.bank, withTentative = FALSE)
  bank_df <- Boruta::attStats(boruta.bank)
  print(bank_df)
if (!dir.exists("./data/result/ML/RF")) {
dir.create("./data/result/ML/RF", recursive = TRUE)
}
save(X_train, y_train, X_test, y_test,
fs_Boruta, boruta.bank, bank_df,
file = "./data/result/ML/RF/HCC_RF_Boruta.RData")
} else {
load("./data/result/ML/RF/HCC_RF_Boruta.RData")
}
plot(boruta.bank, xlab = "", xaxt = "n")
lz <- lapply(1:ncol(boruta.bank$ImpHistory), function(i) {
boruta.bank$ImpHistory[is.finite(boruta.bank$ImpHistory[, i]), i]
})
names(lz) <- colnames(boruta.bank$ImpHistory)
Labels <- sort(sapply(lz, median))
axis(side = 1, las = 2, labels = names(Labels),
at = 1:ncol(boruta.bank$ImpHistory), cex.axis = 0.7)
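The confirmed-feature table feature_Boruta used below is not constructed in the code shown; a minimal sketch, assuming it keeps the attributes whose Boruta decision is "Confirmed" (row names are the FeatureIDs, matching the rownames(feature_Boruta) calls below):
feature_Boruta <- bank_df %>%
  dplyr::filter(decision == "Confirmed")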
trainData_select <- trainData %>%
dplyr::select(all_of(c("Group", rownames(feature_Boruta))))
X_train_select <- trainData_select[, -1]
y_train_select <- trainData_select[, 1]
testData_select <- testData %>%
dplyr::select(all_of(c("Group", rownames(feature_Boruta))))
X_test_select <- testData_select[, -1]
y_test_select <- testData_select[, 1]
20.4.4 Parameter Tuning
To optimize the random forest model, repeated cross-validation was used to tune its two key parameters: mtry (the number of features randomly sampled at each split) and ntree (the total number of trees). Different combinations of mtry and ntree were evaluated with repeated cross-validation, which repeatedly splits the data into training and validation folds and evaluates each parameter combination multiple times, yielding more stable and reliable performance estimates. The mtry and ntree values giving the best performance were then selected.
set.seed(123)
## Tune mtry via repeated CV (5-fold x 3 repeats and the grid are assumptions)
tune_fit <- caret::train(
  Group ~ ., data = trainData_select,
  method = "rf",
  trControl = caret::trainControl(method = "repeatedcv", number = 5, repeats = 3),
  tuneGrid = expand.grid(mtry = seq(2, 20, by = 2)))
## Print the best tuning parameter that maximizes model accuracy
optimalVar <- data.frame(tune_fit$results[which.max(tune_fit$results[, 3]), ])
# print(optimalVar)
## Plot model accuracy vs different values of mtry
print(plot(tune_fit))
- Optimal number of features: determined from cross-validation errors over repeated runs, as sketched below
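The error matrix error.cv used below is assumed to come from five replicated runs of randomForest::rfcv over the Boruta-selected training features (the fold count, step size, and number of replicates are assumptions):
# Each column of error.cv holds one replicate's cross-validation error across
# decreasing feature counts; row names are the feature counts (n.var)
error.cv <- sapply(1:5, function(i) {
  set.seed(i)
  fit <- randomForest::rfcv(trainx = X_train_select, trainy = y_train_select,
                            cv.fold = 5, step = 0.9)
  fit$error.cv
})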
n.var <- as.numeric(rownames(error.cv))
colnames(error.cv) <- paste("error", 1:5, sep = ".")
err.mean <- apply(error.cv, 1, mean)
err.df <- data.frame(num = n.var,
err.mean = err.mean,
error.cv)
head(err.df[, 1:6])
- Visualizing the optimal number of features
optimal <- err.df$num[which(err.df$err.mean == min(err.df$err.mean))]
main_theme <-
theme(
panel.background = element_blank(),
panel.grid = element_blank(),
axis.line.x = element_line(linewidth = 0.5, color = "black"),
axis.line.y = element_line(linewidth = 0.5, color = "black"),
axis.ticks = element_line(color = "black"),
axis.title = element_text(size = 11, color = "black", face = "bold"),
axis.text = element_text(color = "black", size = 10),
legend.position = "right",
legend.background = element_blank(),
legend.key = element_blank(),
legend.text = element_text(size = 12),
text = element_text(family = "serif"))
pl <-
  ggplot(data = err.df, aes(x = num)) +
  geom_line(aes(y = error.1), color = "grey", linewidth = 0.5) +
  geom_line(aes(y = error.2), color = "grey", linewidth = 0.5) +
  geom_line(aes(y = error.3), color = "grey", linewidth = 0.5) +
  geom_line(aes(y = error.4), color = "grey", linewidth = 0.5) +
  geom_line(aes(y = error.5), color = "grey", linewidth = 0.5) +
  geom_line(aes(y = err.mean), color = "black", linewidth = 0.5) +
  geom_vline(xintercept = optimal, color = "red", linewidth = 0.36, linetype = 2) +
coord_trans(x = "log2") +
scale_x_continuous(breaks = c(1, 5, 10, 20, 30)) +
labs(x = "Number of Features ", y = "Cross-validation error rate") +
annotate("text",
x = optimal - 6,
y = max(err.df$err.mean),
label = paste("Optimal = ", optimal, sep = ""),
color = "red") +
main_theme
pl
20.4.5 Final Classification Model
Having determined the optimal values of mtry and ntree and chosen a suitable number of features, these parameters and features are applied to build the final random forest model.
selected_biomarker <- imp_biomarker %>%
dplyr::filter(FeatureID %in% rownames(feature_Boruta)) %>%
dplyr::slice(1:optimal)
selected_columns <- c("Group", selected_biomarker$FeatureID)
trainData_optimal <- trainData_select %>%
dplyr::select(all_of(selected_columns))
testData_optimal <- testData_select %>%
dplyr::select(all_of(selected_columns))
set.seed(123)
# Final model with the tuned mtry and the optimal feature set (ntree = 1000 is
# an assumption; the original call is not shown here)
rf_fit_optimal <- randomForest(Group ~ ., data = trainData_optimal,
                               mtry = optimalVar$mtry, ntree = 1000,
                               importance = TRUE)
rf_fit_optimal
Result: compared with the base model, the final model achieves a lower error rate.
20.4.6 Test Set Validation
- Confusion matrix
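The predictions pred_raw (class labels) and pred_prob (class probabilities) used below are not computed in the code shown; a minimal sketch using the final model on the held-out test set:
pred_raw <- predict(rf_fit_optimal, newdata = testData_optimal, type = "response")
pred_prob <- predict(rf_fit_optimal, newdata = testData_optimal, type = "prob")
print(caret::confusionMatrix(pred_raw, testData_optimal$Group))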
- AUROC curve: using the pROC::roc function (Robin et al. 2011)
AUROC <- function(
    DataTest,
    PredProb = pred_prob,
    nfeature) {
  # ROC object; using the first probability column as predictor is an assumption
  rocobj <- pROC::roc(DataTest$Group, PredProb[, 1])
  # AUC with a 95% confidence interval, used as the plot annotation
  roc_CI <- pROC::ci.auc(rocobj, conf.level = 0.95)
  roc_CI_lab <- paste0("AUC = ", round(as.numeric(rocobj$auc), 3),
                       " (95% CI: ", round(roc_CI[1], 3), "-",
                       round(roc_CI[3], 3), ")")
  # Curve coordinates and the Youden-optimal threshold
  roc <- data.frame(tpr = rocobj$sensitivities,
                    fpr = 1 - rocobj$specificities)
  rocbj_df <- data.frame(threshold = rocobj$thresholds,
                         sensitivities = rocobj$sensitivities,
                         specificities = rocobj$specificities)
  max_value_row <- which.max(rocbj_df$sensitivities + rocbj_df$specificities)
  threshold <- round(rocbj_df$threshold[max_value_row], 3)
  # plot
  pl <- ggplot(data = roc, aes(x = fpr, y = tpr)) +
    geom_path(color = "red", linewidth = 1) +
    geom_abline(intercept = 0, slope = 1,
                color = "grey", linewidth = 1, linetype = 2) +
labs(x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate",
title = paste0("AUROC (", nfeature, " Features)")) +
annotate("text",
x = 1 - rocbj_df$specificities[max_value_row] + 0.15,
y = rocbj_df$sensitivities[max_value_row] - 0.05,
label = paste0(threshold, " (",
rocbj_df$specificities[max_value_row], ",",
rocbj_df$sensitivities[max_value_row], ")"),
             size = 5, family = "serif") +
annotate("point",
x = 1 - rocbj_df$specificities[max_value_row],
y = rocbj_df$sensitivities[max_value_row],
color = "black", size = 2) +
annotate("text",
x = .75, y = .25,
label = roc_CI_lab,
size = 5, family = "serif") +
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
theme_bw() +
theme(panel.background = element_rect(fill = "transparent"),
plot.title = element_text(size = 12, color = "black", face = "bold"),
axis.title = element_text(size = 11, color = "black", face = "bold"),
        axis.text = element_text(size = 10, color = "black"),
        axis.ticks.length = unit(0.4, "lines"),
        axis.ticks = element_line(color = "black"),
        axis.line = element_line(linewidth = .5, color = "black"),
        text = element_text(size = 8, color = "black", family = "serif"))
res <- list(rocobj = rocobj,
roc_CI = roc_CI_lab,
roc_pl = pl)
return(res)
}
AUROC_res <- AUROC(
DataTest = testData_select,
PredProb = pred_prob,
nfeature = optimal)
AUROC_res$roc_pl
- AUPRC curve: using the pROC::roc function (Robin et al. 2011)
AUPRC <- function(DataTest, PredProb, nfeature) {
  # Precision and recall at every threshold, derived from the ROC object via
  # pROC::coords; this reconstruction is an assumption, as the original
  # computation is not shown in this section
  rocobj <- pROC::roc(DataTest$Group, PredProb[, 1])
  dat_PR <- pROC::coords(rocobj, "all",
                         ret = c("threshold", "recall", "precision"),
                         transpose = FALSE)
  prc <- dat_PR %>% dplyr::filter(!is.na(precision))
  # plot
  pl <- ggplot(data = prc, aes(x = recall, y = precision)) +
    geom_path(color = "red", linewidth = 1) +
labs(x = "Recall",
y = "Precision",
title = paste0("AUPRC (", nfeature, " Features)")) +
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
theme_bw() +
theme(panel.background = element_rect(fill = "transparent"),
plot.title = element_text(color = "black", size = 14, face = "bold"),
axis.ticks.length = unit(0.4, "lines"),
axis.ticks = element_line(color = "black"),
        axis.line = element_line(linewidth = .5, color = "black"),
axis.title = element_text(color = "black", size = 12, face = "bold"),
axis.text = element_text(color = "black", size = 10),
text = element_text(size = 8, color = "black", family = "serif"))
res <- list(dat_PR = dat_PR,
PC_pl = pl)
return(res)
}
AUPRC_res <- AUPRC(
DataTest = testData_select,
PredProb = pred_prob,
nfeature = optimal)
AUPRC_res$PC_pl
- Model evaluation metrics: using the pROC::roc (Robin et al. 2011) and MLmetrics::PRAUC (Yan 2016) functions
Evaluate_index <- function(DataTest, PredProb, label, PredRaw) {
  # ROC object
  rocobj <- roc(DataTest$Group, PredProb[, 1])
  # Youden-optimal threshold (the coordinate extraction mirrors the AUROC
  # helper above; the original lines are not shown here)
  rocbj_df <- data.frame(threshold = rocobj$thresholds,
                         sensitivities = rocobj$sensitivities,
                         specificities = rocobj$specificities)
  max_value_row <- which.max(rocbj_df$sensitivities + rocbj_df$specificities)
  threshold <- round(rocbj_df$threshold[max_value_row], 3)
  # Confusion matrix: rows are predictions, columns the reference labels;
  # treating the first group level as the positive class is an assumption
  con_matrix <- table(PredRaw, DataTest$Group)
  TP <- con_matrix[1, 1]
  FP <- con_matrix[1, 2]
  FN <- con_matrix[2, 1]
  TN <- con_matrix[2, 2]
sen <- round(TP / (TP + FN), 3) # caret::sensitivity(con_matrix)
spe <- round(TN / (TN + FP), 3) # caret::specificity(con_matrix)
acc <- round((TP + TN) / (TP + TN + FP + FN), 3) # Accuracy
pre <- round(TP / (TP + FP), 3) # precision
rec <- round(TP / (TP + FN), 3) # recall
#F1S <- round(2 * TP / (TP + TN + FP + FN + TP - TN), 3)# F1-Score
F1S <- round(2 * TP / (2 * TP + FP + FN), 3)# F1-Score
youden <- sen + spe - 1 # youden index
# AUROC
AUROC <- round(as.numeric(auc(DataTest$Group, PredProb[, 1])), 3)
# AUPRC
AUPRC <- round(MLmetrics::PRAUC(y_pred = PredProb[, 1],
y_true = DataTest$Group), 3)
index_df <- data.frame(Index = c("Threshold", "Sensitivity",
"Specificity", "Accuracy",
"Precision", "Recall",
"F1 Score", "Youden index",
"AUROC", "AUPRC"),
Value = c(threshold, sen, spe,
acc, pre, rec, F1S,
youden, AUROC, AUPRC)) %>%
stats::setNames(c("Index", label))
return(index_df)
}
Evaluate_index(
  DataTest = testData_select,
  PredProb = pred_prob,
  label = grp_names[1],
  PredRaw = pred_raw)
Results: the table reports the following metrics and their values:
Threshold: 0.585. The threshold converts predicted probabilities into class labels (in binary classification, samples with a predicted probability above the threshold are typically called positive and those below it negative).
Sensitivity: 0.861. Also known as the true positive rate (TPR) or recall; the proportion of actual positives that the model correctly predicts as positive.
Specificity: 0.429. Also known as the true negative rate (TNR); the proportion of actual negatives that the model correctly predicts as negative.
Accuracy: 0.728. The proportion of all samples that the model classifies correctly.
Precision: 0.773. The proportion of predicted positives that are actually positive.
Recall: 0.861 (identical to Sensitivity, since the two are equivalent in binary classification).
F1 score: 0.814. The harmonic mean of precision and recall, combining both into a single measure.
Youden index: 0.290. A measure of diagnostic test performance, equal to sensitivity plus specificity minus 1.
AUROC (Area Under the Receiver Operating Characteristic Curve): 0.704. The area under the ROC curve, a standard measure of classifier performance.
AUPRC (Area Under the Precision-Recall Curve): 0.220. The area under the precision-recall curve, particularly informative when positives are much rarer than negatives.
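As a quick sanity check, the reported F1 score and Youden index can be recomputed from the precision, recall, sensitivity, and specificity above:
# F1 is the harmonic mean of precision and recall
2 * 0.773 * 0.861 / (0.773 + 0.861) # ~0.814
# Youden index = sensitivity + specificity - 1
0.861 + 0.429 - 1 # 0.290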
20.5 Marker Genes
Boruta feature selection combined with the random forest cross-validation error rate lets the model automatically pick out the features with a significant impact on the predictions; these are referred to as marker genes or significant features.
optimal_feature <- imp_biomarker %>%
dplyr::filter(FeatureID %in% rownames(feature_Boruta)) %>%
dplyr::slice(1:optimal) %>%
dplyr::select(FeatureID, MeanDecreaseAccuracy) %>%
dplyr::arrange(MeanDecreaseAccuracy) %>%
dplyr::mutate(FeatureID = forcats::fct_inorder(FeatureID))
head(optimal_feature)
- Visualizing the features
optimal <- 39 # the final number of selected features (see the Summary below)
optimal_feature_pl <- imp_biomarker %>%
dplyr::filter(FeatureID %in% rownames(feature_Boruta)) %>%
dplyr::slice(1:optimal) %>%
dplyr::select(FeatureID, MeanDecreaseAccuracy) %>%
dplyr::arrange(MeanDecreaseAccuracy) %>%
dplyr::mutate(FeatureID = forcats::fct_inorder(FeatureID)) %>%
  ggplot(aes(x = FeatureID, y = MeanDecreaseAccuracy)) +
geom_bar(stat = "identity", fill = "white", color = "blue") +
labs(x = "", y = "Mean decrease accuracy") +
coord_flip() +
main_theme
optimal_feature_pl
- Extracting the expression profile of the selected features
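The object profile_RF written out in the next section is not built in the code shown; a minimal sketch, subsetting the collinearity-filtered profile to the final marker genes:
# Rows are samples; columns are restricted to the optimal feature set
profile_RF <- profile_remain %>%
  dplyr::select(all_of(as.character(optimal_feature$FeatureID)))
head(profile_RF[, 1:6])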
20.6 Exporting Results
if (!dir.exists("./data/result/ML/RF")) {
dir.create("./data/result/ML/RF", recursive = TRUE)
}
write.csv(optimal_feature, "./data/result/ML/RF/HCC_RF_feature.csv", row.names = F)
write.table(profile_RF, "./data/result/ML/RF/HCC_RF_profile.tsv", row.names = T, sep = "\t", quote = F)
save(AUROC_res, file = "./data/result/ML/RF/HCC_RF_AUROC.RData")
if (!dir.exists("./data/result/Figure/")) {
dir.create("./data/result/Figure/", recursive = TRUE)
}
pdf("./data/result/Figure/SFig3-D.pdf", width = 5, height = 4)
plot(boruta.bank, xlab = "", xaxt = "n")
lz <- lapply(1:ncol(boruta.bank$ImpHistory), function(i) {
boruta.bank$ImpHistory[is.finite(boruta.bank$ImpHistory[, i]), i]
})
names(lz) <- colnames(boruta.bank$ImpHistory)
Labels <- sort(sapply(lz, median))
axis(side = 1, las = 2, labels = names(Labels),
at = 1:ncol(boruta.bank$ImpHistory), cex.axis = 0.7)
dev.off()
ggsave("./data/result/Figure/SFig3-E.pdf", AUROC_res$roc_pl, width = 5, height = 4, dpi = 600)
ggsave("./data/result/Figure/SFig3-F.pdf", optimal_feature_pl, width = 5, height = 5, dpi = 600)
20.7 Summary
Boruta feature selection on the differentially expressed genes identified a subset of genes closely associated with the biological condition of interest. Random forest modeling was then used for a further round of feature selection to find the feature set with the best predictive performance. Through this training and evaluation process, 39 features were ultimately selected and showed reasonably good predictive performance. These 39 features will be carried into downstream analyses to further explore the mechanisms and potential value of these differentially expressed genes in the underlying biological processes.
Session Information
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices datasets utils methods base
other attached packages:
[1] MLmetrics_1.1.3 Hmisc_5.1-2 pROC_1.18.5
[4] Boruta_8.0.0 mlbench_2.1-5 caret_6.0-94
[7] lattice_0.22-6 randomForest_4.7-1.1 Biobase_2.62.0
[10] BiocGenerics_0.48.1 data.table_1.15.4 lubridate_1.9.3
[13] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
[16] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[19] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 timeDate_4032.109 fastmap_1.1.1
[4] digest_0.6.35 rpart_4.1.23 timechange_0.3.0
[7] lifecycle_1.0.4 cluster_2.1.6 survival_3.7-0
[10] magrittr_2.0.3 compiler_4.3.3 rlang_1.1.3
[13] tools_4.3.3 utf8_1.2.4 yaml_2.3.8
[16] knitr_1.46 htmlwidgets_1.6.4 plyr_1.8.9
[19] foreign_0.8-86 withr_3.0.0 nnet_7.3-19
[22] grid_4.3.3 stats4_4.3.3 fansi_1.0.6
[25] colorspace_2.1-0 future_1.33.2 globals_0.16.3
[28] scales_1.3.0 iterators_1.0.14 MASS_7.3-60.0.1
[31] cli_3.6.2 rmarkdown_2.26 generics_0.1.3
[34] rstudioapi_0.16.0 future.apply_1.11.2 reshape2_1.4.4
[37] tzdb_0.4.0 splines_4.3.3 parallel_4.3.3
[40] BiocManager_1.30.23 base64enc_0.1-3 vctrs_0.6.5
[43] hardhat_1.3.1 Matrix_1.6-5 jsonlite_1.8.8
[46] hms_1.1.3 htmlTable_2.4.2 Formula_1.2-5
[49] listenv_0.9.1 foreach_1.5.2 gower_1.0.1
[52] recipes_1.0.10 glue_1.7.0 parallelly_1.37.1
[55] codetools_0.2-19 stringi_1.8.4 gtable_0.3.5
[58] munsell_0.5.1 pillar_1.9.0 htmltools_0.5.8.1
[61] ipred_0.9-14 lava_1.8.0 R6_2.5.1
[64] evaluate_0.23 backports_1.4.1 renv_1.0.0
[67] class_7.3-22 Rcpp_1.0.12 checkmate_2.3.1
[70] gridExtra_2.3 nlme_3.1-164 prodlim_2023.08.28
[73] xfun_0.43 pkgconfig_2.0.3 ModelMetrics_1.2.2.2