我们将使用系统内安装的R:
# bash命令
export PATH="/sibcb1/bioinformatics/biocourse/biocourse01/miniforge3/envs/scRNA/bin:$PATH"
加载需要使用的软件包:
gsm cellline subtype
GSM1172844 GSM1172844 184A1 Non-malignant
GSM1172845 GSM1172845 184B5 Non-malignant
GSM1172846 GSM1172846 21MT1 Basal
GSM1172847 GSM1172847 21MT2 Basal
GSM1172848 GSM1172848 21NT Basal
GSM1172849 GSM1172849 21PT Basal
head(fpkm_matrix[,1:5])
GSM1172844 GSM1172845 GSM1172846 GSM1172847
7SK 243.60225124 445.62684061 1111.6138680 3.281208e+03
A1BG 0.30412165 19.16440632 4.3758549 4.836862e+00
A1CF 0.00000000 0.00000000 0.0000000 8.769693e-02
A2M 0.08509533 0.09655169 0.0000000 5.350969e-01
A2ML1 167.74791399 536.95489950 3.2924995 3.468497e-01
A3GALT2 0.00000000 0.00000000 0.4552548 0.000000e+00
GSM1172848
7SK 1.579418e+03
A1BG 4.534119e+00
A1CF 8.424754e-02
A2M 3.954230e-01
A2ML1 5.795524e+00
A3GALT2 1.885893e-01
##获取WGCNA输入的matrix (取方差最大的5000个基因,行是样本,列是基因)
var_top5000<-names(sort(apply(fpkm_matrix,1,var),decreasing = T)[1:5000])
WGCNA_matrix = t(fpkm_matrix[var_top5000,]) ##留下方差最大的基因的表达谱,并进行转置
dim(WGCNA_matrix) ##52行,5000列
[1] 52 5000
软阈值的筛选原则是使构建的网络更符合无标度网络特征
powers = c(c(1:10), seq(from = 12, to=20, by=2)) #定义一系列候选的软阈值
##选择最佳的软阈值,使网络符合无尺度特性
sft = pickSoftThreshold(WGCNA_matrix , powerVector = powers, verbose = 5)
pickSoftThreshold: will use block size 5000.
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 5000 of 5000
Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
1 1 0.0588 -0.694 0.902 962.000 948.0000 1670.0
2 2 0.4870 -1.700 0.952 293.000 270.0000 777.0
3 3 0.7120 -2.060 0.984 113.000 94.8000 419.0
4 4 0.7890 -2.200 0.995 51.100 39.9000 248.0
5 5 0.8260 -2.290 0.990 25.900 19.2000 156.0
6 6 0.8610 -2.330 0.995 14.300 10.0000 102.0
7 7 0.8710 -2.350 0.992 8.460 5.5900 69.8
8 8 0.8790 -2.310 0.997 5.320 3.3100 49.0
9 9 0.8800 -2.230 0.995 3.520 2.0300 35.2
10 10 0.8880 -2.070 0.985 2.430 1.2800 25.8
11 12 0.9400 -1.940 0.978 1.300 0.5440 19.3
12 14 0.9630 -1.850 0.977 0.792 0.2490 17.0
13 16 0.9550 -1.750 0.955 0.531 0.1190 15.3
14 18 0.9550 -1.610 0.949 0.384 0.0598 14.0
15 20 0.9600 -1.520 0.953 0.295 0.0317 12.9
print(sft$powerEstimate) ## power=6
[1] 6
# 横轴是Soft threshold (power),纵轴是无标度网络的评估参数。
# 数值越高,网络越符合无标度特征
par(mfrow = c(1, 2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit, \nsigned R^2",
type = "n",
main = paste("Scale independence"))
text(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels = powers, cex = 0.9, col = "red")
# 筛选标准,R-square=0.85
abline(h=0.85,col="red")
## Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab = "Soft Threshold (power)", ylab = "Mean Connectivity",
type = "n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = 0.9, col = "red")
net = blockwiseModules(WGCNA_matrix,
power = sft$powerEstimate,
maxBlockSize = ncol(WGCNA_matrix), #每个区块中的最大基因数
TOMType = "unsigned", #sined表示同时考虑正负相关性
minModuleSize = 50, #最小模块大小,调整会改变模块数量
reassignThreshold = 0,
mergeCutHeight = 0.25, #指定合并模块的高度,调整会改变模块数量
numericLabels = TRUE, #模块标签是否应为数字
pamRespectsDendro = FALSE,
saveTOMs = TRUE, #是否保存拓扑重叠矩阵
saveTOMFileBase = "wgcna/WGCNA-TOM",
verbose = 3)
Calculating module eigengenes block-wise from all genes
Flagging genes and samples with too many missing values...
..step 1
..Working on block 1 .
TOM calculation: adjacency..
..will not use multithreading.
Fraction of slow calculations: 0.000000
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
..saving TOM for block 1 into file wgcna/WGCNA-TOM-block.1.RData
....clustering..
....detecting modules..
....calculating module eigengenes..
....checking kME in modules..
..removing 120 genes from module 1 because their KME is too low.
..removing 4 genes from module 2 because their KME is too low.
..removing 6 genes from module 3 because their KME is too low.
..removing 5 genes from module 4 because their KME is too low.
..removing 17 genes from module 5 because their KME is too low.
..removing 2 genes from module 6 because their KME is too low.
..removing 2 genes from module 7 because their KME is too low.
..removing 3 genes from module 10 because their KME is too low.
..removing 9 genes from module 11 because their KME is too low.
..removing 9 genes from module 12 because their KME is too low.
..removing 2 genes from module 15 because their KME is too low.
..removing 2 genes from module 16 because their KME is too low.
..removing 8 genes from module 17 because their KME is too low.
..removing 6 genes from module 18 because their KME is too low.
..removing 11 genes from module 19 because their KME is too low.
..removing 4 genes from module 20 because their KME is too low.
..removing 2 genes from module 23 because their KME is too low.
..merging modules that are too close..
mergeCloseModules: Merging modules whose distance is less than 0.25
Calculating new MEs...
table(net$colors) ##查看共分成多少模块;24个
0 1 2 3 4 5 6 7 8 9 10 11 12 13
330 1334 388 293 274 256 249 173 154 148 138 129 128 122
14 15 16 17 18 19 20 21 22 23
121 117 117 100 91 80 70 68 63 57
moduleColors = labels2colors(net$colors) ##将label转变成颜色
names(moduleColors)<-colnames(WGCNA_matrix)
# 对基因进行聚类,不同module绘制不同颜色
# 其中灰色默认是无法归类于任何模块的那些基因
plotDendroAndColors(net$dendrograms[[1]],
moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
## 计算表达矩阵中每个模块的特征向量
MEs0 <- moduleEigengenes(WGCNA_matrix, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #MEs是每个模块在每个样本里面的特征值
##相当于模块中所有基因表达量的加权
head(MEs[,1:5])
MEdarkred MEblack MEtan MElightyellow
GSM1172844 0.008458430 -0.01659226 -0.004035594 -0.04436948
GSM1172845 0.028803825 -0.02062389 -0.020513508 -0.06612129
GSM1172846 0.002592997 -0.01703464 -0.050634300 -0.05789268
GSM1172847 -0.001168734 -0.01919862 -0.036900123 -0.04353926
GSM1172848 0.033151143 -0.01398153 -0.037133305 -0.06043319
GSM1172849 0.019142664 -0.01281848 -0.038697461 -0.06045317
MEroyalblue
GSM1172844 -0.06042507
GSM1172845 -0.04287740
GSM1172846 -0.06606877
GSM1172847 -0.06853496
GSM1172848 -0.06593047
GSM1172849 -0.06339564
write.csv(MEs, file = "wgcna/ModuleEigengenes.csv") ##保存结果
##将subtype信息转换成连续矩阵计算相关性
levels(datTraits$subtype)##查看性状信息
[1] "Basal" "Claudin-low" "Luminal" "Non-malignant"
design=model.matrix(~0+ datTraits$subtype)
colnames(design)=levels(datTraits$subtype)
##计算模块与形状的相关性及对应的P-value
moduleTraitCor = cor(MEs, design , use = "p")
moduleTraitPvalue = corPvalueStudent(cor=moduleTraitCor, nSamples=nrow(WGCNA_matrix))
## 相关性和P-value整理成矩阵
textMatrix = paste(signif(moduleTraitCor, 2),"\n(",signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.8,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
setEnrichrSite("Enrichr")
websiteLive <- TRUE
listEnrichrDbs() ##列出所有enrichR可用的数据集
dbs <- c("KEGG_2021_Human", "GO_Biological_Process_2023") ##KEGG和GO通路
##将每个module富集的通路保存到list
pathway_list_wgcna_kegg<-list()
pathway_list_wgcna_go<-list()
for(i in names(table(moduleColors)))
{
genes<-names(moduleColors)[moduleColors==i]
enriched_i<-enrichr(genes, dbs) ##网卡的话可能需要多跑几次
pathway_list_wgcna_kegg[[i]]<-enriched_i[[1]][enriched_i[[1]]$Adjusted.P.value<0.1,]
pathway_list_wgcna_go[[i]]<-enriched_i[[2]][enriched_i[[2]]$Adjusted.P.value<0.1,]
}
##在module和性状相关性表格后面加上通路富集的信息,可以看到不同subtype特异性的module富集的通路
moduleTraitCor_add_path<-as.data.frame(moduleTraitCor)
moduleTraitCor_add_path$KEGG<-NA
moduleTraitCor_add_path$GO<-NA
moduleTraitCor_add_path[paste0("ME",names(pathway_list_wgcna_kegg)),]$KEGG<-
sapply(pathway_list_wgcna_kegg,function(x) paste(x$Term,collapse=";"))
moduleTraitCor_add_path[paste0("ME",names(pathway_list_wgcna_go)),]$GO<-
sapply(pathway_list_wgcna_go,function(x) paste(x$Term,collapse=";"))
View(moduleTraitCor_add_path)
write.csv(moduleTraitCor_add_path, file = "wgcna/moduleTraitCor_add_path.csv") ##保存结果
## 算出每个模块跟基因的皮尔森相关系数矩阵
geneModuleCor = as.data.frame(cor(WGCNA_matrix, MEs, use = "p"))
## 算出每个性状跟基因的皮尔森相关系数矩阵
## 这里把是否属于 Non-malignant 表型这个变量用0,1进行数值化。
geneTraitCor = as.data.frame(cor(WGCNA_matrix, design , use = "p"))
## 把两个相关性矩阵联合起来,指定感兴趣模块和性状进行分析
module = "cyan"
##Non-malignant
moduleGenes = names(moduleColors[moduleColors==module])
verboseScatterplot(abs(geneModuleCor[moduleGenes, paste0("ME",module)]),
abs(geneTraitCor[moduleGenes, "Non-malignant"]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene correlation for trait",
main = paste("Module membership vs trait\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
##Luminal
verboseScatterplot(abs(geneModuleCor[moduleGenes, paste0("ME",module)]),
abs(geneTraitCor[moduleGenes, "Luminal"]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene correlation for trait",
main = paste("Module membership vs trait\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
##只留下能富集到通路的基因和模块
##在KEGG通路中的基因
<-as.character(unlist(sapply(pathway_list_wgcna_kegg,
genes_to_draw_inpathways_keggfunction(x) unique(strsplit(paste(x$Genes,collapse = ";"),split = ";")[[1]]))))
##在GO通路中的基因
<-as.character(unlist(sapply(pathway_list_wgcna_go,
genes_to_draw_inpathways_gofunction(x) unique(strsplit(paste(x$Genes,collapse = ";"),split = ";")[[1]]))))
##取并集为留下的基因
<-union(genes_to_draw_inpathways_kegg,
genes_to_draw_inpathways
genes_to_draw_inpathways_go) ##富集到通路的modules
<-rownames(moduleTraitCor_add_path
module_withpathways$KEGG!="" | moduleTraitCor_add_path$GO!="",])
[moduleTraitCor_add_path
length(genes_to_draw_inpathways) ##2383
length(module_withpathways) ##22
## TOM文件提取指定的基因
## 拓扑重叠矩阵
load(net$TOMFiles[1], verbose = T)
<- as.matrix(TOM)
TOM = colnames(WGCNA_matrix)
probes = colnames(WGCNA_matrix) %in% genes_to_draw_inpathways
inModule = probes[inModule]
modProbes = TOM[inModule, inModule]
modTOM dimnames(modTOM) = list(modProbes, modProbes)
## 导出cytoscape能读取的NODE和EDGE文件
= exportNetworkToCytoscape(modTOM,
cyt edgeFile = paste("wgcna/CytoscapeInput-edges.txt", sep=""),
nodeFile = paste("wgcna/CytoscapeInput-nodes.txt", sep=""),
weighted = TRUE,
threshold = 0.05, ##大小决定留下的边的数量
nodeNames = modProbes,
nodeAttr = moduleColors[inModule])
<-read.delim("wgcna/CytoscapeInput-edges.txt",stringsAsFactors = F)
Edge_file<-read.delim("wgcna/CytoscapeInput-nodes.txt",stringsAsFactors = F)
Node_file
##fromNode限制50
<-list()
Edge_listfor(i in names(table(Edge_file$fromNode)))
{<-Edge_file[Edge_file$fromNode==i,]
Edge_list[[i]]<-Edge_list[[i]][order(Edge_list[[i]]$weight,decreasing = T),]
Edge_list[[i]]if(nrow(Edge_list[[i]])>50)
{<-Edge_list[[i]][1:50,]
Edge_list[[i]]
}
}<-Reduce(rbind,Edge_list)
Edge_file_50_from
##toNode限制50
<-list()
Edge_listfor(i in names(table(Edge_file_50_from$toNode)))
{<-Edge_file_50_from[Edge_file_50_from$toNode==i,]
Edge_list[[i]]<-Edge_list[[i]][order(Edge_list[[i]]$weight,decreasing = T),]
Edge_list[[i]]if(nrow(Edge_list[[i]])>50)
{<-Edge_list[[i]][1:50,]
Edge_list[[i]]
}
}<-Reduce(rbind,Edge_list)
Edge_file_50dim(Edge_file_50) ## 24364
<-Node_file[Node_file$nodeName %in%
Node_file_50union(Edge_file_50$fromNode,Edge_file_50$toNode),]
dim(Node_file_50) ##1993基因
##给Node_file添加亚群之间fold-change的信息
<-log2(fpkm_matrix+1) ##log2 FPKM
fpkm_matrix_log2rownames(Node_file_50)<-Node_file_50$nodeName
=rownames(datTraits[datTraits$subtype=="Basal",])
sample_case=rownames(datTraits[datTraits$subtype!="Basal",])
sample_ctrl$Fold_Basal<-sapply(rownames(Node_file_50),function(x)
Node_file_50mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))
=rownames(datTraits[datTraits$subtype=="Claudin-low",])
sample_case=rownames(datTraits[datTraits$subtype!="Claudin-low",])
sample_ctrl$Fold_Claudinlow<-sapply(rownames(Node_file_50),function(x)
Node_file_50mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))
=rownames(datTraits[datTraits$subtype=="Luminal",])
sample_case=rownames(datTraits[datTraits$subtype!="Luminal",])
sample_ctrl$Fold_Luminal<-sapply(rownames(Node_file_50),function(x)
Node_file_50mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))
=rownames(datTraits[datTraits$subtype=="Non-malignant",])
sample_case=rownames(datTraits[datTraits$subtype!="Non-malignant",])
sample_ctrl$Fold_Non_malignant<-sapply(rownames(Node_file_50),function(x)
Node_file_50mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))
##输出筛选过的边和节点
write.table(Node_file_50,"wgcna/Node_file_50.txt",quote = F,row.names = F,sep = "\t")
write.table(Edge_file_50,"wgcna/Edge_file_50.txt",quote = F,row.names = F,sep = "\t")
本文档由陈洛南组唐诗婕博士构建,如需帮助,请联系bioinformatics[at]sibcb.ac.cn
获得唐博士的联系方式。