Bioinformatics training course: bulk RNA-seq: （WGCNA）共表达网络分析

全局设定

我们将使用系统内安装的R：

# bash命令
export PATH="/sibcb1/bioinformatics/biocourse/biocourse01/miniforge3/envs/scRNA/bin:$PATH"

加载需要使用的软件包：

## if (!require("BiocManager", quietly = TRUE))
##  install.packages("BiocManager")
## BiocManager::install("WGCNA") ##按照R包WGCNA
library(WGCNA) ##加载R包WGCNA
library(enrichR) ##用enrichR包进行富集

数据加载和预处理

##加载表达谱以及对应表型信息
load("wgcna/wgcna-input-GSE48213.RData")
head(datTraits)

                  gsm cellline       subtype
GSM1172844 GSM1172844    184A1 Non-malignant
GSM1172845 GSM1172845    184B5 Non-malignant
GSM1172846 GSM1172846    21MT1         Basal
GSM1172847 GSM1172847    21MT2         Basal
GSM1172848 GSM1172848     21NT         Basal
GSM1172849 GSM1172849     21PT         Basal

head(fpkm_matrix[,1:5])

          GSM1172844   GSM1172845   GSM1172846   GSM1172847
7SK     243.60225124 445.62684061 1111.6138680 3.281208e+03
A1BG      0.30412165  19.16440632    4.3758549 4.836862e+00
A1CF      0.00000000   0.00000000    0.0000000 8.769693e-02
A2M       0.08509533   0.09655169    0.0000000 5.350969e-01
A2ML1   167.74791399 536.95489950    3.2924995 3.468497e-01
A3GALT2   0.00000000   0.00000000    0.4552548 0.000000e+00
          GSM1172848
7SK     1.579418e+03
A1BG    4.534119e+00
A1CF    8.424754e-02
A2M     3.954230e-01
A2ML1   5.795524e+00
A3GALT2 1.885893e-01

##获取WGCNA输入的matrix (取方差最大的5000个基因，行是样本，列是基因)
var_top5000<-names(sort(apply(fpkm_matrix,1,var),decreasing = T)[1:5000])
WGCNA_matrix = t(fpkm_matrix[var_top5000,]) ##留下方差最大的基因的表达谱，并进行转置
dim(WGCNA_matrix) ##52行,5000列

[1]   52 5000

网络构建和模块检测

选择软阈值

软阈值的筛选原则是使构建的网络更符合无标度网络特征

powers = c(c(1:10), seq(from = 12, to=20, by=2)) #定义一系列候选的软阈值
##选择最佳的软阈值，使网络符合无尺度特性
sft = pickSoftThreshold(WGCNA_matrix , powerVector = powers, verbose = 5)

pickSoftThreshold: will use block size 5000.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 5000 of 5000
   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
1      1   0.0588 -0.694          0.902 962.000  948.0000 1670.0
2      2   0.4870 -1.700          0.952 293.000  270.0000  777.0
3      3   0.7120 -2.060          0.984 113.000   94.8000  419.0
4      4   0.7890 -2.200          0.995  51.100   39.9000  248.0
5      5   0.8260 -2.290          0.990  25.900   19.2000  156.0
6      6   0.8610 -2.330          0.995  14.300   10.0000  102.0
7      7   0.8710 -2.350          0.992   8.460    5.5900   69.8
8      8   0.8790 -2.310          0.997   5.320    3.3100   49.0
9      9   0.8800 -2.230          0.995   3.520    2.0300   35.2
10    10   0.8880 -2.070          0.985   2.430    1.2800   25.8
11    12   0.9400 -1.940          0.978   1.300    0.5440   19.3
12    14   0.9630 -1.850          0.977   0.792    0.2490   17.0
13    16   0.9550 -1.750          0.955   0.531    0.1190   15.3
14    18   0.9550 -1.610          0.949   0.384    0.0598   14.0
15    20   0.9600 -1.520          0.953   0.295    0.0317   12.9

print(sft$powerEstimate) ## power=6

[1] 6

# 横轴是Soft threshold (power)，纵轴是无标度网络的评估参数。
# 数值越高，网络越符合无标度特征 
par(mfrow = c(1, 2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab = "Soft Threshold (power)", 
     ylab = "Scale Free Topology Model Fit, \nsigned R^2",
     type = "n", 
     main = paste("Scale independence"))
text(sft$fitIndices[,1],
     -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels = powers, cex = 0.9, col = "red")
# 筛选标准，R-square=0.85
abline(h=0.85,col="red") 

## Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab = "Soft Threshold (power)", ylab = "Mean Connectivity",
     type = "n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = 0.9, col = "red")

构建网络并检测模块

net = blockwiseModules(WGCNA_matrix, 
                       power = sft$powerEstimate,
                       maxBlockSize = ncol(WGCNA_matrix), #每个区块中的最大基因数
                       TOMType = "unsigned", #sined表示同时考虑正负相关性
                       minModuleSize = 50, #最小模块大小,调整会改变模块数量
                       reassignThreshold = 0, 
                       mergeCutHeight = 0.25, #指定合并模块的高度,调整会改变模块数量 
                       numericLabels = TRUE, #模块标签是否应为数字
                       pamRespectsDendro = FALSE,
                       saveTOMs = TRUE, #是否保存拓扑重叠矩阵
                       saveTOMFileBase = "wgcna/WGCNA-TOM",
                       verbose = 3)

 Calculating module eigengenes block-wise from all genes
   Flagging genes and samples with too many missing values...
    ..step 1
 ..Working on block 1 .
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
   ..saving TOM for block 1 into file wgcna/WGCNA-TOM-block.1.RData
 ....clustering..
 ....detecting modules..
 ....calculating module eigengenes..
 ....checking kME in modules..
     ..removing 120 genes from module 1 because their KME is too low.
     ..removing 4 genes from module 2 because their KME is too low.
     ..removing 6 genes from module 3 because their KME is too low.
     ..removing 5 genes from module 4 because their KME is too low.
     ..removing 17 genes from module 5 because their KME is too low.
     ..removing 2 genes from module 6 because their KME is too low.
     ..removing 2 genes from module 7 because their KME is too low.
     ..removing 3 genes from module 10 because their KME is too low.
     ..removing 9 genes from module 11 because their KME is too low.
     ..removing 9 genes from module 12 because their KME is too low.
     ..removing 2 genes from module 15 because their KME is too low.
     ..removing 2 genes from module 16 because their KME is too low.
     ..removing 8 genes from module 17 because their KME is too low.
     ..removing 6 genes from module 18 because their KME is too low.
     ..removing 11 genes from module 19 because their KME is too low.
     ..removing 4 genes from module 20 because their KME is too low.
     ..removing 2 genes from module 23 because their KME is too low.
 ..merging modules that are too close..
     mergeCloseModules: Merging modules whose distance is less than 0.25
       Calculating new MEs...

table(net$colors) ##查看共分成多少模块；24个


   0    1    2    3    4    5    6    7    8    9   10   11   12   13 
 330 1334  388  293  274  256  249  173  154  148  138  129  128  122 
  14   15   16   17   18   19   20   21   22   23 
 121  117  117  100   91   80   70   68   63   57

moduleColors = labels2colors(net$colors) ##将label转变成颜色
names(moduleColors)<-colnames(WGCNA_matrix)

# 对基因进行聚类，不同module绘制不同颜色
# 其中灰色默认是无法归类于任何模块的那些基因
plotDendroAndColors(net$dendrograms[[1]], 
                    moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

## 计算表达矩阵中每个模块的特征向量
MEs0 <- moduleEigengenes(WGCNA_matrix, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #MEs是每个模块在每个样本里面的特征值
##相当于模块中所有基因表达量的加权
head(MEs[,1:5])

              MEdarkred     MEblack        MEtan MElightyellow
GSM1172844  0.008458430 -0.01659226 -0.004035594   -0.04436948
GSM1172845  0.028803825 -0.02062389 -0.020513508   -0.06612129
GSM1172846  0.002592997 -0.01703464 -0.050634300   -0.05789268
GSM1172847 -0.001168734 -0.01919862 -0.036900123   -0.04353926
GSM1172848  0.033151143 -0.01398153 -0.037133305   -0.06043319
GSM1172849  0.019142664 -0.01281848 -0.038697461   -0.06045317
           MEroyalblue
GSM1172844 -0.06042507
GSM1172845 -0.04287740
GSM1172846 -0.06606877
GSM1172847 -0.06853496
GSM1172848 -0.06593047
GSM1172849 -0.06339564

write.csv(MEs, file = "wgcna/ModuleEigengenes.csv") ##保存结果

##将subtype信息转换成连续矩阵计算相关性
levels(datTraits$subtype)##查看性状信息

[1] "Basal"         "Claudin-low"   "Luminal"       "Non-malignant"

design=model.matrix(~0+ datTraits$subtype) 
colnames(design)=levels(datTraits$subtype)

##计算模块与形状的相关性及对应的P-value
moduleTraitCor = cor(MEs, design , use = "p") 
moduleTraitPvalue = corPvalueStudent(cor=moduleTraitCor, nSamples=nrow(WGCNA_matrix)) 

## 相关性和P-value整理成矩阵
textMatrix = paste(signif(moduleTraitCor, 2),"\n(",signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)

par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor,
                 xLabels = colnames(design),
                 yLabels = names(MEs),
                 ySymbols = names(MEs),
                 colorLabels = FALSE,
                 colors = greenWhiteRed(50),
                 textMatrix = textMatrix,
                 setStdMargins = FALSE,
                 cex.text = 0.8,
                 zlim = c(-1,1),
                 main = paste("Module-trait relationships"))

模块功能富集

setEnrichrSite("Enrichr")
websiteLive <- TRUE
listEnrichrDbs() ##列出所有enrichR可用的数据集
dbs <- c("KEGG_2021_Human", "GO_Biological_Process_2023") ##KEGG和GO通路

##将每个module富集的通路保存到list
pathway_list_wgcna_kegg<-list()
pathway_list_wgcna_go<-list()
for(i in names(table(moduleColors)))
{
  genes<-names(moduleColors)[moduleColors==i]
  enriched_i<-enrichr(genes, dbs) ##网卡的话可能需要多跑几次
  pathway_list_wgcna_kegg[[i]]<-enriched_i[[1]][enriched_i[[1]]$Adjusted.P.value<0.1,]
  pathway_list_wgcna_go[[i]]<-enriched_i[[2]][enriched_i[[2]]$Adjusted.P.value<0.1,]
}
  
##在module和性状相关性表格后面加上通路富集的信息，可以看到不同subtype特异性的module富集的通路
moduleTraitCor_add_path<-as.data.frame(moduleTraitCor)
moduleTraitCor_add_path$KEGG<-NA
moduleTraitCor_add_path$GO<-NA
moduleTraitCor_add_path[paste0("ME",names(pathway_list_wgcna_kegg)),]$KEGG<-
       sapply(pathway_list_wgcna_kegg,function(x) paste(x$Term,collapse=";"))
moduleTraitCor_add_path[paste0("ME",names(pathway_list_wgcna_go)),]$GO<-
       sapply(pathway_list_wgcna_go,function(x) paste(x$Term,collapse=";"))
View(moduleTraitCor_add_path)
write.csv(moduleTraitCor_add_path, file = "wgcna/moduleTraitCor_add_path.csv") ##保存结果

感兴趣性状的模块的具体基因分析

## 算出每个模块跟基因的皮尔森相关系数矩阵
geneModuleCor = as.data.frame(cor(WGCNA_matrix, MEs, use = "p"))

## 算出每个性状跟基因的皮尔森相关系数矩阵
## 这里把是否属于 Non-malignant 表型这个变量用0,1进行数值化。
geneTraitCor = as.data.frame(cor(WGCNA_matrix, design , use = "p"))

## 把两个相关性矩阵联合起来,指定感兴趣模块和性状进行分析
module = "cyan"
##Non-malignant
moduleGenes = names(moduleColors[moduleColors==module])
verboseScatterplot(abs(geneModuleCor[moduleGenes, paste0("ME",module)]),
                   abs(geneTraitCor[moduleGenes, "Non-malignant"]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene correlation for trait",
                   main = paste("Module membership vs trait\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

##Luminal
verboseScatterplot(abs(geneModuleCor[moduleGenes, paste0("ME",module)]),
                     abs(geneTraitCor[moduleGenes, "Luminal"]),
                     xlab = paste("Module Membership in", module, "module"),
                     ylab = "Gene correlation for trait",
                     main = paste("Module membership vs trait\n"),
                     cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

模块可视化（cytoscape）

##只留下能富集到通路的基因和模块
##在KEGG通路中的基因
genes_to_draw_inpathways_kegg<-as.character(unlist(sapply(pathway_list_wgcna_kegg,
function(x)  unique(strsplit(paste(x$Genes,collapse = ";"),split = ";")[[1]]))))

##在GO通路中的基因
genes_to_draw_inpathways_go<-as.character(unlist(sapply(pathway_list_wgcna_go,
function(x) unique(strsplit(paste(x$Genes,collapse = ";"),split = ";")[[1]]))))

##取并集为留下的基因
genes_to_draw_inpathways<-union(genes_to_draw_inpathways_kegg,
                                genes_to_draw_inpathways_go) 
##富集到通路的modules
module_withpathways<-rownames(moduleTraitCor_add_path
    [moduleTraitCor_add_path$KEGG!="" | moduleTraitCor_add_path$GO!="",])
                                                        
length(genes_to_draw_inpathways)  ##2383
length(module_withpathways) ##22

## TOM文件提取指定的基因
## 拓扑重叠矩阵                                                        
load(net$TOMFiles[1], verbose = T) 
TOM <- as.matrix(TOM)                                                        
probes = colnames(WGCNA_matrix)
inModule =  colnames(WGCNA_matrix) %in% genes_to_draw_inpathways
modProbes = probes[inModule]
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
## 导出cytoscape能读取的NODE和EDGE文件
cyt = exportNetworkToCytoscape(modTOM,
  edgeFile = paste("wgcna/CytoscapeInput-edges.txt", sep=""),
  nodeFile = paste("wgcna/CytoscapeInput-nodes.txt", sep=""),
  weighted = TRUE,
  threshold = 0.05, ##大小决定留下的边的数量
  nodeNames = modProbes,
  nodeAttr = moduleColors[inModule])

Edge_file<-read.delim("wgcna/CytoscapeInput-edges.txt",stringsAsFactors = F)
Node_file<-read.delim("wgcna/CytoscapeInput-nodes.txt",stringsAsFactors = F)

##fromNode限制50
Edge_list<-list()
for(i in names(table(Edge_file$fromNode)))
{
  Edge_list[[i]]<-Edge_file[Edge_file$fromNode==i,]
  Edge_list[[i]]<-Edge_list[[i]][order(Edge_list[[i]]$weight,decreasing = T),]
  if(nrow(Edge_list[[i]])>50)
  {
    Edge_list[[i]]<-Edge_list[[i]][1:50,]
  }
}
Edge_file_50_from<-Reduce(rbind,Edge_list)

##toNode限制50
Edge_list<-list()
for(i in names(table(Edge_file_50_from$toNode)))
{
  Edge_list[[i]]<-Edge_file_50_from[Edge_file_50_from$toNode==i,]
  Edge_list[[i]]<-Edge_list[[i]][order(Edge_list[[i]]$weight,decreasing = T),]
  if(nrow(Edge_list[[i]])>50)
  {
    Edge_list[[i]]<-Edge_list[[i]][1:50,]
  }
}
Edge_file_50<-Reduce(rbind,Edge_list)
dim(Edge_file_50) ## 24364 
Node_file_50<-Node_file[Node_file$nodeName %in% 
              union(Edge_file_50$fromNode,Edge_file_50$toNode),]
dim(Node_file_50) ##1993基因

##给Node_file添加亚群之间fold-change的信息
fpkm_matrix_log2<-log2(fpkm_matrix+1) ##log2 FPKM
rownames(Node_file_50)<-Node_file_50$nodeName

sample_case=rownames(datTraits[datTraits$subtype=="Basal",])
sample_ctrl=rownames(datTraits[datTraits$subtype!="Basal",])
Node_file_50$Fold_Basal<-sapply(rownames(Node_file_50),function(x) 
mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))

sample_case=rownames(datTraits[datTraits$subtype=="Claudin-low",])
sample_ctrl=rownames(datTraits[datTraits$subtype!="Claudin-low",])
Node_file_50$Fold_Claudinlow<-sapply(rownames(Node_file_50),function(x)
mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))

sample_case=rownames(datTraits[datTraits$subtype=="Luminal",])
sample_ctrl=rownames(datTraits[datTraits$subtype!="Luminal",])
Node_file_50$Fold_Luminal<-sapply(rownames(Node_file_50),function(x) 
mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))

sample_case=rownames(datTraits[datTraits$subtype=="Non-malignant",])
sample_ctrl=rownames(datTraits[datTraits$subtype!="Non-malignant",])
Node_file_50$Fold_Non_malignant<-sapply(rownames(Node_file_50),function(x) 
mean(as.numeric(fpkm_matrix_log2[x,sample_case]))-
mean(as.numeric(fpkm_matrix_log2[x,sample_ctrl])))

##输出筛选过的边和节点
write.table(Node_file_50,"wgcna/Node_file_50.txt",quote = F,row.names = F,sep = "\t")
write.table(Edge_file_50,"wgcna/Edge_file_50.txt",quote = F,row.names = F,sep = "\t")

致谢

本文档由陈洛南组唐诗婕博士构建，如需帮助，请联系bioinformatics[at]sibcb.ac.cn获得唐博士的联系方式。