细胞类型特异性网络(Cell-specific network)是陈洛南组首先提出并应用的方法,我们主要以该论文为基础,学习其在单细胞分析中的应用。本课件所依赖的数据和软件包都可以从原论文的Github下载。
为了计算基因\(x\)和\(y\)在细胞\(k\)中是否存在网络连接(edge),以该细胞为中心,在x轴和y轴分别定义一个框(box),默认情况下,这个框包含10%的细胞。然后该方法计算了三个数:\(n_{x}\)、\(n_{y}\)、\(n_{xy}\),分别代表这两个框中的细胞数以及两个框重合的细胞数。然后,如图所示,构建统计量\(\rho_{xy}\),该统计量呈正态分布,均值为0,在给定的可信度下,就能够判定在细胞\(k\)中\(x\)和\(y\)之间是否有网络连接。
需要注意的是:
csndm
这个函数产生的NDM已经去除了表达量为0的基因的degree,如果不想去除表达量为0的链接,可以使用我们修改过的函数csndm_E0
。另外,csnet
这个函数包括了表达量为0的连接。设定R环境:
数据准备:
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/Seurat/csnData .
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/Seurat/csn .
CSN的一个重要的潜在用途是改善单细胞数据的分类效果。我们以Buettner数据集为例,比较基于表达谱(GEM)和网络度矩阵(NDM)的分类结果。该数据含有182个细胞,分为三类,是mESC处于不同的细胞周期(G1,S,G2/M)。
CSN是一个matlab程序,可参照如下的示例代码进行操作,构建NDM。其中的核心函数是csndm(data,alpha,boxsize,normalize)
:
# 打开交互式matlab
# /sibcb/program/install/matlab/R2016b/bin/matlab
# 在matlab中交互执行
addpath('./csn')
load('csnData/Buettner.mat')
# 查看部分数据,每行是一个基因,每列是一个样本
data(1:10,1:4)
size(data)
# csndm (注意:每行代码后加分号,避免自动输出结果)
=csndm(data, 0.01, 0.1, 0);
ndmdlmwrite('csnOut/Buettner_GEM.txt', data);
dlmwrite('csnOut/Buettner_NDM.txt', ndm);
# 也可以产生均一化的ndm
=csndm(data, 0.01, 0.1, 1);
ndmdlmwrite('csnOut/Buettner_NDM_Normalized.txt', ndm);
我们先直接使用TSNE来进行展示GEM和均一化的NDM降维以后得到的结果。
GEM <- read.csv("csnData/Buettner_GEM.txt",header = F)
NDM <- read.csv("csnData/Buettner_NDM_Normalized.txt",header = F)
Label <- read.csv("csnData/Buettner_Celltype.txt",header = F)
GEM <- log2(GEM + 1)
NDM <- log2(NDM + 1)
tsne <- Rtsne(t(GEM), dims = 2,pca = TRUE, initial_dims=20, pca_scale= TRUE, pca_center=TRUE, perplexity = 30)
Buettner_GEM <- data.frame(tSNE1 = tsne$Y[,1], tSNE2 = tsne$Y[,2], CellType=as.factor(Label$V1))
tsne <- Rtsne(t(NDM), dims = 2,pca = TRUE, initial_dims=20, pca_scale= TRUE, pca_center=TRUE, perplexity = 30)
Buettner_NDM <- data.frame(tSNE1 = tsne$Y[,1], tSNE2 = tsne$Y[,2], CellType=as.factor(Label$V1))
p1 <- ggplot(Buettner_GEM, aes(x = tSNE1, y = tSNE2, color = CellType)) +
geom_point(size = 1) + theme_bw() +
labs(x = "t-SNE 1", y = "t-SNE 2", title = "Buettner_GEM") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p2 <- ggplot(Buettner_NDM, aes(x = tSNE1, y = tSNE2, color = CellType)) +
geom_point(size = 1) + theme_bw() +
labs(x = "t-SNE 1", y = "t-SNE 2", title = "Buettner_NDM") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p1 + p2
可以看出,NDM的降维结果和期望的类别有更好的吻合关系。
我们也可以使用前几次课程所讲述的Seurat流程对NDM进行分析。需要注意的是,标准化的NDM并不是整数矩阵,但是流程仍然可用。
counts = as.matrix(read.table(file = 'csnData/Buettner_NDM_Normalized.txt', row.names = NULL, header = FALSE, sep = ','))
rownames(counts) = paste0('G', 1:nrow(counts))
colnames(counts) = paste0('C', 1:ncol(counts))
obj <- CreateSeuratObject(counts = counts, project = "Buettner", min.cells = 0, min.features = 0)
obj <- obj %>% NormalizeData %>% FindVariableFeatures(nfeatures = 2000) %>%
ScaleData(features = rownames(obj)) %>% RunPCA %>% RunUMAP(dims = 1:20, seed.use=2)
obj[['ref']] = readLines('csnData/Buettner_Celltype.txt')
col_palatte <- c('#6965CE', '#EE5755', '#75E15A', '#D5CE85')
DimPlot(obj, label = F, pt.size = 1, group.by = 'ref', cols = col_palatte)
所谓的Dark Gene是指表达谱没有发生显著变化,而网络度有明显变化的基因。在传统的分析方法中,这些基因会被漏掉,因而叫做Dark Genes。我们用Chu-Type数据集作为例子来描述,这个数据集含有1018个细胞,7种数据类型。
# 进入matlab
# /sibcb/program/install/matlab/R2016b/bin/matlab
addpath('./csn')
=readtable('csnData/ChuType_GEM.csv','ReadVariableNames', true, 'ReadRowNames', true);
data=csndm(table2array(data), 0.01, 0.1, 1);
ndmdlmwrite('csnOut/ChuType_NDM_Normalized.txt', ndm);
=csndm(table2array(data), 0.01, 0.1, 0);
ndmdlmwrite('csnOut/ChuType_NDM.txt', ndm);
exit
我们先使用Seurat流程对表达谱GEM进行分析。
col_palatte <- c('#6965CE', '#EE5755', '#75E15A', '#D5CE85', '#9F33D9', '#9BE2CF', '#69A5C8',
'#FC9500', '#D846AF', '#CE95C4', '#0018F9')
tx = read.csv('csnData/ChuType_TSNE.csv',header = T)
counts_GEM = as.matrix(read.table(file = 'csnData/ChuType_GEM.csv', row.names = 1, header = TRUE, sep = ','))
GEM <- CreateSeuratObject(counts = counts_GEM, project = "ChuType", min.cells = 0, min.features = 0)
GEM <- GEM %>% NormalizeData %>% FindVariableFeatures(nfeatures = 2000) %>%
ScaleData(features = rownames(GEM)) %>% RunPCA %>% RunTSNE(dims = 1:20, seed.use=2)
Idents(GEM) = as.character(tx$CellType)
DimPlot(GEM, label = F, pt.size = 0.5, cols = col_palatte)
可以发现,H1和H9这两类无法区分,其他的类别差别明显。
我们再使用Seurat流程对网络度矩阵NDM进行分析。
counts_NDM = as.matrix(read.table(file = 'csnData/ChuType_NDM_Normalized.txt', row.names = NULL, header = FALSE, sep = ','))
dimnames(counts_NDM) = dimnames(counts_GEM)
NDM <- CreateSeuratObject(counts = counts_NDM, project = "ChuType", min.cells = 0, min.features = 0)
NDM <- NDM %>% NormalizeData %>% FindVariableFeatures(nfeatures = 2000) %>%
ScaleData(features = rownames(NDM)) %>% RunPCA %>% RunTSNE(dims = 1:20, seed.use=2)
Idents(NDM) = as.character(tx$CellType)
DimPlot(NDM, label = F, pt.size = 0.5, cols = col_palatte)
可以发现,7大类基本可以完美区分,再一次显示NDM比GEM有更好的分类效果。
我们以ZHX2和PECAM1为例来展示CSN如何用于新假设的发现。
matlab code
# /sibcb/program/install/matlab/R2016b/bin/matlab
# addpath('./csn')
data=readtable('csnData/ChuType_GEM.csv','ReadVariableNames', true, 'ReadRowNames', true);
Index={'ZHX2','PECAM1'};
GEM=data(ismember(data.Properties.RowNames,Index), :);
gx= round(GEM(1,:).Variables);
gy= round(GEM(2,:).Variables);
edge= csnedge(gx,gy ,0.1);
ZHX2_PECAM1=array2table(edge')
ZHX2_PECAM1.Properties.RowNames=data.Properties.VariableNames;
ZHX2_PECAM1.Properties.VariableNames{'Var1'} = 'ZHX2_PECAM1';
writetable(ZHX2_PECAM1, 'csnOut/ChuType_ZHX2_PECAM1_weight.csv', 'Delimiter', ',','WriteRowNames', true, 'WriteVariableNames', true);
如下所示,从表达谱数据看,ZHX2的表达量很低,在内皮细胞(EC)中没有显示出特异性;而PECAM1的表达量很高,在EC中非常特异。
genes = c('ZHX2', 'PECAM1')
FeaturePlot(GEM, features = genes, ncol=2, cols=c('grey95', 'red3'), label=F, pt.size=0.5)
从网络度的数据来看,这两个基因在EC细胞中都有较高的度。
# 读入未均一化的度矩阵。注意,这个NDM没有去除表达量为0的链接。
Raw_NDM = as.matrix(read.table(file = 'csnData/ChuType_NDM.txt', row.names = NULL, header = FALSE, sep = ','))
dimnames(Raw_NDM) = dimnames(counts_NDM)
NDM = AddMetaData(NDM, t(Raw_NDM[genes,]), paste0(genes, '_degree'))
FeaturePlot(NDM, features = paste0(genes, '_degree'), ncol=2, cols=c('grey95', 'red3'), label=F, pt.size=0.5)
我们还能直接展示在每个细胞中,这两个基因之间连接权重(统计量:\(\rho_{xy}\)标准化以后的值)。
tx = read.table(file = 'csnData/ChuType_ZHX2_PECAM1_weight.csv', row.names = 1, header = TRUE, sep = ',')
rownames(tx) = colnames(NDM)
NDM = AddMetaData(NDM, tx, 'ZHX2_PECAM1_weight')
FeaturePlot(NDM, features = 'ZHX2_PECAM1_weight', ncol=1, cols=c('grey95', 'red3'), label=F, pt.size=0.5)
最后一个例子使用了Chu-Time数据集,包含758个细胞,来自于6个分化时间点。
data_GEM <- read.csv("csnData/ChuTime_GEM.txt",header = F)
data_NDM <- read.csv("csnData/ChuTime_NDM.txt",header = F)
data_Label <- read.csv("csnData/ChuTime_Celltype.txt",header = F)
data_Feature <- read.csv("csnData/ChuTime_Genes.txt",header = F)
idx = data_Feature[,1] == 'POU5F1'
out = data.frame(GEM = as.numeric(data_GEM[idx, ]),
NDM = as.numeric(data_NDM[idx, ]),
TimeSeries = factor(data_Label, levels = as.character(1:6)))
ggplot(out, aes(x = TimeSeries, y = NDM, color = TimeSeries)) +
geom_jitter(size = 1) + theme_bw() +
labs(x = "Time Series", y = "Degree", title = "POU5F1") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
ggplot(out, aes(x = TimeSeries, y = GEM, color = TimeSeries)) +
geom_jitter(size = 1) + theme_bw() +
labs(x = "Time Series", y = "Expression", title = "POU5F1") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server 7.7 (Maipo)
Matrix products: default
BLAS/LAPACK: /sibcb1/bioinformatics/biocourse/biocourse01/miniforge3/envs/scRNA/lib/libopenblasp-r0.3.27.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Asia/Shanghai
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] Rtsne_0.17 ggplot2_3.5.0 patchwork_1.2.0
[4] dplyr_1.1.4 Seurat_5.0.3 SeuratObject_5.0.2
[7] sp_2.1-3 knitr_1.46 rmarkdown_2.26
loaded via a namespace (and not attached):
[1] deldir_2.0-4 pbapply_1.7-2
[3] gridExtra_2.3 rlang_1.1.3
[5] magrittr_2.0.3 RcppAnnoy_0.0.22
[7] spatstat.geom_3.2-9 matrixStats_1.1.0
[9] ggridges_0.5.6 compiler_4.3.3
[11] png_0.1-8 vctrs_0.6.5
[13] reshape2_1.4.4 stringr_1.5.1
[15] pkgconfig_2.0.3 fastmap_1.1.1
[17] labeling_0.4.3 utf8_1.2.4
[19] promises_1.3.0 purrr_1.0.2
[21] xfun_0.43 cachem_1.0.8
[23] jsonlite_1.8.8 goftest_1.2-3
[25] highr_0.10 later_1.3.2
[27] spatstat.utils_3.0-5 irlba_2.3.5.1
[29] parallel_4.3.3 cluster_2.1.6
[31] R6_2.5.1 ica_1.0-3
[33] spatstat.data_3.0-4 stringi_1.8.3
[35] bslib_0.7.0 RColorBrewer_1.1-3
[37] reticulate_1.35.0 parallelly_1.37.1
[39] lmtest_0.9-40 jquerylib_0.1.4
[41] scattermore_1.2 Rcpp_1.0.12
[43] tensor_1.5 future.apply_1.11.2
[45] zoo_1.8-12 sctransform_0.4.1
[47] httpuv_1.6.15 Matrix_1.6-5
[49] splines_4.3.3 igraph_2.0.2
[51] tidyselect_1.2.1 abind_1.4-5
[53] yaml_2.3.8 spatstat.random_3.2-3
[55] spatstat.explore_3.2-6 codetools_0.2-20
[57] miniUI_0.1.1.1 listenv_0.9.1
[59] lattice_0.22-6 tibble_3.2.1
[61] plyr_1.8.9 shiny_1.8.1.1
[63] withr_3.0.0 ROCR_1.0-11
[65] evaluate_0.23 future_1.33.2
[67] fastDummies_1.7.3 survival_3.5-8
[69] polyclip_1.10-6 fitdistrplus_1.1-11
[71] pillar_1.9.0 KernSmooth_2.23-22
[73] plotly_4.10.4 generics_0.1.3
[75] RcppHNSW_0.6.0 munsell_0.5.1
[77] scales_1.3.0 globals_0.16.3
[79] xtable_1.8-4 glue_1.7.0
[81] lazyeval_0.2.2 tools_4.3.3
[83] data.table_1.15.2 RSpectra_0.16-1
[85] distill_1.6 RANN_2.6.1
[87] leiden_0.4.3.1 dotCall64_1.1-1
[89] cowplot_1.1.3 grid_4.3.3
[91] tidyr_1.3.1 colorspace_2.1-0
[93] nlme_3.1-164 cli_3.6.2
[95] spatstat.sparse_3.0-3 spam_2.10-0
[97] fansi_1.0.6 viridisLite_0.4.2
[99] uwot_0.1.16 downlit_0.4.4
[101] gtable_0.3.4 sass_0.4.9
[103] digest_0.6.35 progressr_0.14.0
[105] ggrepel_0.9.5 farver_2.1.1
[107] htmlwidgets_1.6.4 memoise_2.0.1
[109] htmltools_0.5.8.1 lifecycle_1.0.4
[111] httr_1.4.7 mime_0.12
[113] MASS_7.3-60