我们将使用系统内安装的R:
# bash命令
source /sibcb/program/install/hdf5-1.10/profile
source /sibcb/program/install/r-4.4/profile
加载R包:
CelliD是一个基于多重对应分析(Multiple Correspondence Analysis,MCA)的单细胞分析软件,它能够衡量给定基因集(gene set)在每个单细胞中的活性,可以分析通路的活性。如果给定的基因集是细胞的marker,则可以用于单细胞的注释。对应的研究论文发表于2021年的NBT,请读全文。作者也开发了R包可用使用CelliD。
pbmc <- readRDS('output/pbmc.RDS')
pbmc
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
2 dimensional reductions calculated: pca, umap
pbmc <- RunMCA(pbmc, nmcs = 50, reduction.name = "mca", slot = "data")
1.649 sec elapsed
105.452 sec elapsed
4.11 sec elapsed
pbmc
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
3 dimensional reductions calculated: pca, umap, mca
MCA分解能够同时将细胞和基因投射到同一个空间。比如,我们可以展示CD14 Monocytes
的标识基因CD14
,以及CD4 T cell
的标识基因CD14
:
DimPlotMC(pbmc, reduction = "mca", group.by = "cellType",
features = c("IL7R", "CD14"), as.text = TRUE) +
ggtitle("MCA with some key gene markers")
我们也可以回顾下先前分类的结果:
通过超几何分布的检验,返回Log10转化的adjust P-value.
gsl <- list(BSA = readLines('CelliD/B_cell_activation.txt'))
HGT <- RunCellHGT(pbmc, pathways = gsl, dims = 1:50, n.features = 200, log.trans = T, p.adjust = T)
展示活性的分布:
Bscore <- HGT[,1]
pbmc <- AddMetaData(pbmc, Bscore, col.name = 'B_cell_activation')
FeaturePlot(pbmc, features = c('B_cell_activation'))
label <- rep('others', length(pbmc$cellType))
label[pbmc$cellType == "B"] <- "B"
roc_empirical <- rocit(score = pbmc$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])
Method used: empirical
Number of positive(s): 344
Number of negative(s): 2294
Area under curve: 0.9476
我们分析的第二个数据是来自于胰腺的scRNA-seq Baron et al. 2016. 这个例子在CelliD
的官方文档中有详细的描述。
data("HgProteinCodingGenes")
BaronMatrix <- readRDS('CelliD/BaronMatrix.rds')
BaronMetaData <- readRDS('CelliD/BaronMetaData.rds')
BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,]
Baron <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData)
Baron
An object of class Seurat
14804 features across 8569 samples within 1 assay
Active assay: RNA (14804 features, 0 variable features)
1 layer present: counts
Baron <- NormalizeData(Baron)
Baron <- ScaleData(Baron, features = rownames(Baron))
Baron <- RunMCA(Baron)
5.599 sec elapsed
110.844 sec elapsed
14.298 sec elapsed
DimPlotMC(Baron, reduction = "mca", group.by = "cell.type",
features = c("CTRL", "INS", "MYZAP", "CDH11"), as.text = TRUE) +
ggtitle("MCA with some key gene markers")
Baron <- RunPCA(Baron, features = rownames(Baron))
Baron <- RunUMAP(Baron, dims = 1:30)
Baron <- RunTSNE(Baron, dims = 1:30)
Baron <- RunTSNE(Baron, dims = 1:30)
tSNE <- DimPlot(Baron, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
UMAP <- DimPlot(Baron, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
# ggarrange(tSNE, UMAP, common.legend = TRUE, legend = "top")
读取细胞marker数据文件:
chunks <- strsplit(readLines('CelliD/PanglaoDB_markers_27_Mar_2020.tsv.gz'), '\t')
header <- chunks[[1]]
chunks <- chunks[-1]
df <- as.data.frame(do.call(rbind, chunks), stringsAsFactors = FALSE)
colnames(df) <- gsub(' ', '_', header)
df <- df %>% filter(organ == "Pancreas", str_detect(species, "Hs"))
head(df)
species official_gene_symbol cell_type nicknames
1 Mm Hs CTRB1 Acinar cells CTRB
2 Mm Hs KLK1 Acinar cells Klk6
3 Mm Hs RBPJL Acinar cells RBP-L|SUHL|RBPSUHL
4 Mm Hs PTF1A Acinar cells PTF1-p48|bHLHa29
5 Mm Hs CELA3A Acinar cells ELA3|ELA3A
6 Mm Hs PRSS1 Acinar cells TRY1
ubiquitousness_index
1 0.017
2 0.013
3 0.001
4 0.001
5 0.001
6 0.002
product_description
1 chymotrypsinogen B1
2 kallikrein 1
3 recombination signal binding protein for immunoglobulin kappa J region like
4 pancreas associated transcription factor 1a
5 chymotrypsin like elastase family member 3A
6 serine protease 1
gene_type canonical_marker germ_layer organ
1 protein-coding gene 1 Endoderm Pancreas
2 protein-coding gene 1 Endoderm Pancreas
3 protein-coding gene 1 Endoderm Pancreas
4 protein-coding gene 1 Endoderm Pancreas
5 protein-coding gene 1 Endoderm Pancreas
6 protein-coding gene 1 Endoderm Pancreas
sensitivity_human sensitivity_mouse specificity_human
1 1.0 0.957143 0.000628931
2 0.833333 0.314286 0.00503145
3 0.0 0.0 0.0
4 0.0 0.157143 0.000628931
5 0.833333 0.128571 0.0
6 1.0 0.0285714 0.00597484
specificity_mouse
1 0.0159201
2 0.0128263
3 0.0
4 0.000773445
5 0.0
6 0.0
构建gene set list
:
panglao_pancreas <- df %>% group_by(`cell_type`) %>% summarise(geneset = list(`official_gene_symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell_type`)
names(pancreas_gs)
[1] "Acinar cells" "Alpha cells"
[3] "Beta cells" "Delta cells"
[5] "Ductal cells" "Epsilon cells"
[7] "Gamma (PP) cells" "Pancreatic progenitor cells"
[9] "Pancreatic stellate cells" "Peri-islet Schwann cells"
计算gene set的富集性:
HGT_pancreas_gs <- RunCellHGT(Baron, pathways = pancreas_gs, dims = 1:50, n.features = 200)
pancreas_gs_prediction <- rownames(HGT_pancreas_gs)[apply(HGT_pancreas_gs, 2, which.max)]
pancreas_gs_prediction_signif <- ifelse(apply(HGT_pancreas_gs, 2, max)>2, yes = pancreas_gs_prediction, "unassigned")
Baron$pancreas_gs_prediction <- pancreas_gs_prediction_signif
cmatrix <- table(Baron$cell.type, Baron$pancreas_gs_prediction)
kable(cmatrix)
Acinar cells | Alpha cells | Beta cells | Delta cells | Ductal cells | Epsilon cells | Gamma (PP) cells | Pancreatic stellate cells | Peri-islet Schwann cells | unassigned | |
---|---|---|---|---|---|---|---|---|---|---|
Acinar cells | 940 | 0 | 0 | 0 | 8 | 0 | 0 | 3 | 0 | 7 |
Alpha cells | 8 | 1636 | 0 | 2 | 0 | 2 | 13 | 0 | 0 | 665 |
Beta cells | 14 | 1 | 2042 | 37 | 0 | 0 | 8 | 0 | 0 | 423 |
Delta cells | 2 | 0 | 4 | 575 | 0 | 0 | 1 | 0 | 0 | 19 |
Ductal cells | 172 | 0 | 1 | 2 | 793 | 0 | 2 | 0 | 1 | 106 |
Endothelial cells | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 248 |
Epsilon cells | 0 | 0 | 0 | 3 | 0 | 3 | 11 | 0 | 0 | 1 |
Gamma (PP) cells | 0 | 0 | 0 | 0 | 0 | 0 | 255 | 0 | 0 | 0 |
Macrophages | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 55 |
Mast cells | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 |
Pancreatic stellate cells | 0 | 0 | 2 | 2 | 1 | 0 | 1 | 356 | 15 | 80 |
Peri-islet Schwann cells | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | 0 |
T cells | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
展示预测的结果:
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA",
"#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(Baron$cell.type)), "unassigned"))
OriginalPlot <- DimPlot(Baron, reduction = "tsne", group.by = "cell.type") +
scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
Predplot1 <- DimPlot(Baron, reduction = "tsne", group.by = "pancreas_gs_prediction") +
scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, Predplot1, legend = "top",common.legend = TRUE)
# render('scRNAseq_CelliD.Rmd', output_file = '/sibcb1/bioinformatics/dataupload/tmp/scRNAseq_CelliD.html')