28 单细胞细胞识别

单细胞细胞簇识别有两种方式自动识别和手动识别，可以先通过自动识别后再手动纠正细胞类群。

手动注释：通过查询一些常见的细胞类型的标记基因
自动细胞注释网站和软件:

两者相辅相成对聚类结果进行细胞注释。

28.1 加载R包

使用rm(list = ls())来清空环境中的所有变量。

library(tidyverse)
library(Seurat)
library(cowplot)
library(pheatmap)
library(ggh4x)
library(ggunchull)
library(scCATCH)
library(SingleR)
library(UCell)
library(patchwork)
library(ggtree)

rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)

grp_names <- c("Early Stage", "Late Stage")
grp_colors <- c("#8AC786", "#B897CA")

cell_types <- c("Hepatocyte", "T/NK", 
               "Myeloid", "B",
               "Endothelial", "Fibroblast")

cell_colors <- c("#A01FF0", "#1F78B4", "#4EB29D", 
                 "#DA3F4C", "#F1EE97", "#08306B")

28.2 导入数据

seurat_obj数据来自于章节 27


seurat_obj <- readRDS("./data/result/scRNA/Seurat_cluster.RDS")

seurat_obj

28.3 细胞簇的标记基因

细胞簇的标记基因是表征不同细胞簇的特异性基因，这些基因在特定细胞簇中表现出独特的表达模式，从而成为区分不同细胞簇的重要特征。


if (file.exists("./data/result/scRNA/All_Markers_raw.csv")) {
  all_markers_raw <- read.csv("./data/result/scRNA/All_Markers_raw.csv")
} else {
  # 找出每个细胞簇的标记物，与所有剩余的细胞进行比较，只报告阳性细胞
  all_markers_raw <- FindAllMarkers(
    object = seurat_obj,
    only.pos = TRUE,
    min.pct = 0.25,
    logfc.threshold = 0.25,    
    verbose = FALSE)
  
  write.csv(all_markers_raw, "./data/result/scRNA/All_Markers_raw.csv", row.names = FALSE)  
}


topN_markers <- all_markers_raw %>%
  dplyr::group_by(cluster) %>%
  dplyr::top_n(n = 5, wt = avg_log2FC)

# DoHeatmap(object = seurat_obj, 
#           features = top10_markers$gene) + 
#   NoLegend() +
#   scale_fill_gradientn(colors = c("white","grey","firebrick3"))

head(topN_markers)

28.4 ATC细胞注释

生成ATC网站需要的输入文件。


topN_markers_label <- topN_markers %>%
  dplyr::group_by(cluster) %>%
  dplyr::summarise(label = paste0("Cluster", cluster, ":", paste(gene, collapse = ","))) %>%
  dplyr::distinct() %>%
  dplyr::ungroup() 

# cat(paste(topN_markers_label$label, collapse = "\n"))

结果：在Annotation of Cell Types，ATC选择好Human和liver后，每次输入的簇数目必须小于50。注释结果不够理想，很多cluster无法注释出结果，比如cluster0没有注释到细胞类群。

28.5 scCATCH细胞注释

scCATCH(Shao 等 2020)是浙江大学团队在2020年发表的自动注释R包，它提供findmarkergene和findcelltype函数用于自动注释。通过查看数据中的组织和癌症列表判断是否适合自己的数据。

sc_data <- GetAssayData(seurat_obj, assay = "RNA", layer = "scale.data")
sc_cluster <- FetchData(object = seurat_obj, vars = "ident")

obj <- createscCATCH(data = sc_data, cluster = as.character(sc_cluster$ident))

# find marker gene for each cluster
obj <- findmarkergene(
  object = obj, 
  species = "Human",
  cluster = "All",
  marker = cellmatch, 
  use_method = "1",
  tissue = "Liver", 
  cancer = "Liver Cancer",
  cell_min_pct = 0.25,
  logfc = 0.25,
  pvalue = 0.05,
  verbose = TRUE)

# find cell type for each cluster
obj <- findcelltype(obj)

结果：代码运行错误，报错信息No potential marker genes found in the matrix! You may adjust the cell clusters by applying other clustering algorithm and try again。

28.6 SingleR细胞注释

SingleR自建数据集，细胞注释。metadata已经包含文章提供的细胞注释信息，我们直接从seurat_obj构建数据库文件。

seurat_SGR <- seurat_obj

pred_label <- SingleR(
  test = GetAssayData(object = seurat_obj, 
                      assay = "RNA",
                      layer = "data"), 
  ref = SGT_ref,
  labels = SGT_ref$CellType)

plotScoreHeatmap(pred_label)

seurat_SGR_label <- seurat_obj
seurat_SGR_label$singleR <- pred_label$labels
seurat_SGR_label$singleR <- factor(seurat_SGR_label$singleR,
                                   levels = cell_types)

DimPlot(seurat_SGR_label,
        reduction = "tsne",
        label = FALSE,
        group.by = "singleR",        
        cols = cell_colors,
        pt.size = 0.5)

结果：和小节 28.7 的结果还是存在部分差异，说明该方法自动注释效果还是不太行。

28.7 内部细胞注释

seurat_obj@meta.data$celltype列已经包含了公开数据提供的细胞注释信息，我们可以直接使用该信息画图即可。


tSNE_columns <- c("celltype", "tSNE_1", "tSNE_2")
tSNE_data <- FetchData(
  object = seurat_obj,
  vars = tSNE_columns) %>%
  dplyr::mutate(celltype = factor(celltype, levels = cell_types))

tSNE_axis <- ggh4x::guide_axis_truncated(
    trunc_lower = unit(0, "npc"),
    trunc_upper = unit(3, "cm"))

tSNE_pl <- ggplot(tSNE_data, aes(x = tSNE_1, y = tSNE_2, fill = celltype, color = celltype)) +
    geom_point(size = 0.5, show.legend = FALSE) +
    guides(color = "none", fill = guide_legend(title = NULL), 
           x = tSNE_axis, y = tSNE_axis) +
    scale_fill_manual(values = cell_colors) +
    scale_color_manual(values = cell_colors) +  
    scale_x_continuous(breaks = NULL) +
    scale_y_continuous(breaks = NULL) +  
    theme(aspect.ratio = 1,
         panel.background = element_blank(),
         panel.grid = element_blank(),
         axis.line = element_line(arrow = arrow(type = "closed")),
         axis.title = element_text(hjust = 0.05, face = "italic"))

tSNE_pl

结果：在tsne图中，同一细胞类群没有聚集在一起，这和文章提供的图表现不一致，原因是我们仅仅保留了tumor患者的细胞。

修改默认identity为细胞注释结果


seurat_cell <- seurat_obj

Idents(object = seurat_cell) <- "celltype"
levels(seurat_cell) <- cell_types

DimPlot(seurat_cell,
        reduction = "tsne",
        label = FALSE,
        cols = cell_colors,
        pt.size = 0.5)

结果：从图示数据分析中观察到，部分簇内包含了多个细胞类群，这可能是由于新方法的聚类效果未能达到如先前研究那样的高度一致性。为了更准确地描述这些簇，我们需要重新对每个簇进行命名，命名规则将依据簇内出现概率最高的细胞类群来确定。

28.8 重命名簇


seurat_cell_rename <- seurat_obj

table(seurat_obj$seurat_clusters, seurat_obj$celltype)

seurat_cell_rename <- RenameIdents(
  object = seurat_cell_rename, "0" = "Hepatocyte",
  "1" = "Hepatocyte", "2" = "T/NK", "3" = "Myeloid",
  "4" = "Myeloid", "5" = "Hepatocyte", "6" = "Hepatocyte",
  "7" = "Myeloid", "8" = "Myeloid", "9" = "Hepatocyte",
  "10" = "Hepatocyte", "11" = "Hepatocyte", "12" = "Endothelial",
  "13" = "T/NK", "14" = "Hepatocyte", "15" = "Fibroblast",
  "16" = "Hepatocyte", "17" = "T/NK", "18" = "Myeloid",
  "19" = "T/NK", "20" = "T/NK", "21" = "Hepatocyte",
  "22" = "Myeloid", "23" = "Fibroblast", "24" = "Myeloid",
  "25" = "T/NK", "26" = "Hepatocyte", "27" = "Hepatocyte",
  "28" = "Endothelial", "29" = "T/NK", "30" = "Myeloid",
  "31" = "Hepatocyte", "32" = "T/NK", "33" = "Endothelial",
  "34" = "B", "35" = "B", "36" = "Myeloid",
  "37" = "Fibroblast", "38" = "Myeloid", "39" = "Hepatocyte",
  "40" = "Myeloid", "41" = "T/NK", "42" = "Myeloid",
  "43" = "Myeloid", "44" = "Myeloid", "45" = "Hepatocyte",
  "46" = "Hepatocyte", "47" = "T/NK", "48" = "Endothelial",
  "49" = "T/NK", "50" = "T/NK", "51" = "Myeloid")

levels(seurat_cell_rename) <- cell_types

DimPlot(seurat_cell_rename,
        reduction = "tsne",
        label = FALSE,
        cols = cell_colors,
        pt.size = 0.5)

28.9 展示分组基因表达

使用气泡图展示不同细胞类型的差异表达基因。



if (file.exists("./data/result/scRNA/All_Markers_celltype.csv")) {
  all_markers_cell <- read.csv("./data/result/scRNA/All_Markers_celltype.csv")
} else {
  # 找出每个细胞簇的标记物，与所有剩余的细胞进行比较，只报告阳性细胞
  # devtools::install_github('immunogenomics/presto')
  require(presto)
  all_markers_cell <- FindAllMarkers(
    object = seurat_cell_rename,
    only.pos = TRUE,
    min.pct = 0.25,
    logfc.threshold = 0.25,    
    verbose = FALSE)
  
  write.csv(all_markers_cell, "./data/result/scRNA/All_Markers_celltype.csv", row.names = FALSE)  
}

topN_markers_cell <- all_markers_cell %>%
  dplyr::group_by(cluster) %>%
  dplyr::top_n(n = 5, wt = avg_log2FC)

DotPlot_markers <- DotPlot(object = seurat_cell_rename, 
        features = topN_markers_cell$gene,
        cols = cell_colors) + 
  coord_flip() +
  labs(x = "", y = "") +
  scale_color_gradientn(colors = c('#330066', '#336699', '#66CC66', '#FFCC33')) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

ggplot(DotPlot_markers$data, aes(x = id, y = features.plot, size = pct.exp)) +
  geom_point(shape = 21,aes(fill = avg.exp.scaled), position = position_dodge(0)) +
  scale_size_continuous(range = c(1, 10)) + 
  scale_fill_gradient(low = "#E54924", high = "#498EA4") +
  labs(x = "", y = "", size = "Percent Express",
       fill = "Average Expression") +
  # guides(size = guide_legend(title = "Percent Express", order = 1),
  #        fill = guide_legend(title = "Average Expression", order = 2)) +
  theme_bw() +
  theme(legend.position = "right",
        legend.box = "vertical",
        legend.margin = margin(t = 0, unit = 'cm'),
        legend.spacing = unit(0, "in"),
        axis.text.x = element_text(color = "black", size = 11, angle = 45, 
                                    vjust = 0.5, hjust = 0.5),
        axis.text.y = element_text(color = "black", size = 10),
        legend.text = element_text(size = 10, color = "black"),
        legend.title = element_text(size = 10, color = "black"))

结果：从气泡图可以看出每个细胞类型都有自己独有的差异基因。

28.10 美化气泡图

对小节 28.9 的细胞类群的基因表达状况气泡图进行美化，体现不同的原始聚类结果。

cell_bubble_pl <- plot_spacer() + plot_spacer() + ggtree_plot_col +
    plot_spacer() + plot_spacer() + labels + 
    plot_spacer() + plot_spacer() + plot_spacer() +
    ggtree_plot + plot_spacer() + dotplot + 
    plot_spacer() + plot_spacer() + legend + 
    plot_layout(ncol = 3, widths = c(0.7, -0.1, 4), heights = c(0.9, 0.1, -0.1, 4, 1)) &
  theme(text = element_text(family = "serif"))

cell_bubble_pl

结果：气泡图

28.11 单细胞评分

单细胞评分分析UCell(Andreatta 和 Carmona 2021)R包，对细胞类群进行评分。


markers <- list()
markers$Hepatocyte <- c("SPINK1", "COX7B2", "TFF2")
markers$`T/NK` <- c("CD2", "TRAC", "CD3E")
markers$Myeloid <- c("MS4A7", "C5AR1", "FPR3")
markers$B <- c("MS4A1", "BANK1", "TNFRSF13C")
markers$Endothelial <- c("CLEC14A", "EMCN", "TM4SF18")
markers$Fibroblast <- c("COL14A1", "PLN", "MFAP4")

marker_score <- AddModuleScore_UCell(
  obj = seurat_cell_rename,
  features = markers)

FeaturePlot(object = marker_score,
            features = colnames(marker_score@meta.data) %>% 
              stringr::str_subset("_UCell"),
            reduction = "tsne",
            order = T, 
            ncol = 3,
            cols = viridis::viridis(256)) &
  theme(plot.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        legend.title = element_text(size = 10))

28.12 输出结果


if (!dir.exists("./data/result/Figure/")) {
  dir.create("./data/result/Figure/", recursive = TRUE)
}

pdf(file = "./data/result/Figure/Fig7-A.pdf", width = 7, height = 5)
tSNE_pl
dev.off()

if (!dir.exists("./data/result/scRNA/")) {
  dir.create("./data/result/scRNA/", recursive = TRUE)
}

saveRDS(seurat_cell_rename, file = "./data/result/scRNA/Seurat_celltype.RDS", compress = TRUE)

if (!dir.exists("./data/result/Figure/")) {
  dir.create("./data/result/Figure/", recursive = TRUE)
}

ggsave("./data/result/Figure/Fig7-A.pdf", tSNE_pl, width = 7, height = 5, dpi = 600)

28.13 总结

在进行生物信息学或单细胞测序数据的分析中，聚类分析是识别不同细胞类型的关键步骤。一旦聚类完成，对类群进行标记基因的识别、细胞自动注释以及文献结果的整合注释，最后对注释结果进行打分评估，是评估聚类效果的重要流程。

接下来，我们针对每个聚类类群进行标记基因的识别。这些标记基因通常是在该类群中特异性高表达，而在其他类群中表达量较低的基因。通过比较不同类群间的基因表达差异，我们可以筛选出每个类群的特异性标记基因，为后续细胞类型的注释提供基础。除此之外，我们还采用了自动注释方法，但由于数据库的局限性和实验条件的差异，自动注释的结果可能并不完全准确。需要结合手动注释结果和文献验证。

最后，我们需要对注释结果进行打分评估，以验证聚类效果的好坏。

系统信息

sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] ggtree_3.10.1               patchwork_1.2.0            
 [3] UCell_2.6.2                 SingleR_2.4.1              
 [5] SummarizedExperiment_1.32.0 Biobase_2.62.0             
 [7] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
 [9] IRanges_2.36.0              S4Vectors_0.40.2           
[11] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
[13] matrixStats_1.3.0           scCATCH_3.2.2              
[15] ggunchull_1.0.1             ggh4x_0.2.8                
[17] pheatmap_1.0.12             cowplot_1.1.3              
[19] Seurat_5.0.3                SeuratObject_5.0.2         
[21] sp_2.1-4                    lubridate_1.9.3            
[23] forcats_1.0.0               stringr_1.5.1              
[25] dplyr_1.1.4                 purrr_1.0.2                
[27] readr_2.1.5                 tidyr_1.3.1                
[29] tibble_3.2.1                ggplot2_3.5.1              
[31] tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.3.3              
  [3] later_1.3.2                 ggplotify_0.1.2            
  [5] bitops_1.0-7                R.oo_1.26.0                
  [7] polyclip_1.10-6             fastDummies_1.7.3          
  [9] lifecycle_1.0.4             globals_0.16.3             
 [11] lattice_0.22-6              MASS_7.3-60.0.1            
 [13] magrittr_2.0.3              plotly_4.10.4              
 [15] rmarkdown_2.26              yaml_2.3.8                 
 [17] httpuv_1.6.15               sctransform_0.4.1          
 [19] spam_2.10-0                 spatstat.sparse_3.0-3      
 [21] reticulate_1.37.0           pbapply_1.7-2              
 [23] RColorBrewer_1.1-3          abind_1.4-5                
 [25] zlibbioc_1.48.2             Rtsne_0.17                 
 [27] R.utils_2.12.3              RCurl_1.98-1.14            
 [29] yulab.utils_0.1.4           GenomeInfoDbData_1.2.11    
 [31] ggrepel_0.9.5               irlba_2.3.5.1              
 [33] listenv_0.9.1               spatstat.utils_3.0-4       
 [35] tidytree_0.4.6              goftest_1.2-3              
 [37] RSpectra_0.16-1             spatstat.random_3.2-3      
 [39] fitdistrplus_1.1-11         parallelly_1.37.1          
 [41] DelayedMatrixStats_1.24.0   leiden_0.4.3.1             
 [43] codetools_0.2-19            DelayedArray_0.28.0        
 [45] alphahull_2.5               tidyselect_1.2.1           
 [47] aplot_0.2.2                 ScaledMatrix_1.10.0        
 [49] sgeostat_1.0-27             spatstat.explore_3.2-7     
 [51] jsonlite_1.8.8              BiocNeighbors_1.20.2       
 [53] progressr_0.14.0            ggridges_0.5.6             
 [55] survival_3.7-0              dbscan_1.1-12              
 [57] tools_4.3.3                 progress_1.2.3             
 [59] treeio_1.26.0               ica_1.0-3                  
 [61] Rcpp_1.0.12                 glue_1.7.0                 
 [63] gridExtra_2.3               SparseArray_1.2.4          
 [65] xfun_0.43                   withr_3.0.0                
 [67] BiocManager_1.30.23         fastmap_1.1.1              
 [69] fansi_1.0.6                 rsvd_1.0.5                 
 [71] digest_0.6.35               gridGraphics_0.5-1         
 [73] timechange_0.3.0            R6_2.5.1                   
 [75] mime_0.12                   colorspace_2.1-0           
 [77] scattermore_1.2             tensor_1.5                 
 [79] spatstat.data_3.0-4         R.methodsS3_1.8.2          
 [81] utf8_1.2.4                  generics_0.1.3             
 [83] renv_1.0.0                  data.table_1.15.4          
 [85] prettyunits_1.2.0           httr_1.4.7                 
 [87] htmlwidgets_1.6.4           S4Arrays_1.2.1             
 [89] uwot_0.2.2                  pkgconfig_2.0.3            
 [91] gtable_0.3.5                lmtest_0.9-40              
 [93] SingleCellExperiment_1.24.0 XVector_0.42.0             
 [95] htmltools_0.5.8.1           dotCall64_1.1-1            
 [97] scales_1.3.0                png_0.1-8                  
 [99] ggfun_0.1.4                 splancs_2.01-44            
[101] knitr_1.46                  rstudioapi_0.16.0          
[103] tzdb_0.4.0                  reshape2_1.4.4             
[105] nlme_3.1-164                cachem_1.0.8               
[107] zoo_1.8-12                  KernSmooth_2.23-22         
[109] parallel_4.3.3              miniUI_0.1.1.1             
[111] pillar_1.9.0                grid_4.3.3                 
[113] vctrs_0.6.5                 RANN_2.6.1                 
[115] promises_1.3.0              BiocSingular_1.18.0        
[117] beachmat_2.18.1             xtable_1.8-4               
[119] cluster_2.1.6               evaluate_0.23              
[121] cli_3.6.2                   compiler_4.3.3             
[123] rlang_1.1.3                 crayon_1.5.2               
[125] future.apply_1.11.2         interp_1.1-6               
[127] fs_1.6.4                    plyr_1.8.9                 
[129] stringi_1.8.4               BiocParallel_1.36.0        
[131] viridisLite_0.4.2           deldir_2.0-4               
[133] munsell_0.5.1               lazyeval_0.2.2             
[135] spatstat.geom_3.2-9         Matrix_1.6-5               
[137] RcppHNSW_0.6.0              hms_1.1.3                  
[139] sparseMatrixStats_1.14.0    future_1.33.2              
[141] shiny_1.8.1.1               ROCR_1.0-11                
[143] memoise_2.0.1               igraph_2.0.3               
[145] ape_5.8