Bulk RNA-seq的差异表达分析

全局设定

我们将使用系统内安装的R:

/sibcb/program/install/r-4.0/bin/R

设定R的library path:

.libPaths('/sibcb1/bioinformatics/training_shared/software/rlib/r-4.0_lib/')
library("tximport")
library("reshape2")
library("DESeq2")
library("ggplot2")
library("pheatmap")
library("RColorBrewer")
library("pasilla")

开始前的文件准备:

ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/idx
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/featureCounts
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/kallisto

读入Kallistio的定量结果

定量结果

SRX = list.files("kallisto")
files = paste0("kallisto/", SRX, "/abundance.tsv")
names(files) = SRX
cbind(files)
        files                           
ctrl1   "kallisto/ctrl1/abundance.tsv"  
ctrl2   "kallisto/ctrl2/abundance.tsv"  
ctrl3   "kallisto/ctrl3/abundance.tsv"  
stroke1 "kallisto/stroke1/abundance.tsv"
stroke2 "kallisto/stroke2/abundance.tsv"
stroke3 "kallisto/stroke3/abundance.tsv"

基因注释表

tx2gene = read.table(file = 'idx/kallisto/tx2g_symbol.txt', row.names = NULL, header = FALSE, sep = "\t")
head(tx2gene)
                 V1          V2                V3
1 ENST00000456328.2     DDX11L1 ENSG00000223972.5
2 ENST00000450305.2     DDX11L1 ENSG00000223972.5
3 ENST00000488147.1      WASH7P ENSG00000227232.5
4 ENST00000619216.1   MIR6859-1 ENSG00000278267.1
5 ENST00000473358.1 MIR1302-2HG ENSG00000243485.5
6 ENST00000469289.1 MIR1302-2HG ENSG00000243485.5

tximport

txi <- tximport(files, type = "kallisto", tx2gene = tx2gene,ignoreAfterBar = TRUE )
names(txi)
[1] "abundance"           "counts"              "infReps"            
[4] "length"              "countsFromAbundance"

相对表达量 (TPM):

tpm = log2(txi$abundance + 1)
head(txi$abundance)
             ctrl1      ctrl2      ctrl3    stroke1     stroke2
A1BG     7.0668450 12.0477869  6.8267015 0.91054220 1.176894900
A1BG-AS1 2.1620080  3.1512340  2.0337040 0.19374250 0.246123092
A1CF     0.0575614  0.0635564  0.0664542 0.01326064 0.005202681
A2M      7.5932940  6.7255700 21.7159070 0.14195205 0.359023400
A2M-AS1  0.8916910  1.0821252  2.4603140 0.05171970 0.111497000
A2ML1    0.4142338  0.6031055  0.6026923 0.06351190 0.044580240
            stroke3
A1BG     1.12601700
A1BG-AS1 0.18466270
A1CF     0.00000000
A2M      0.48542780
A2M-AS1  0.04178862
A2ML1    0.00966699

质量控制

design matirx

sampleT = data.frame(group = gsub("[123]", "", colnames(tpm)), row.names = colnames(tpm))
sampleT
         group
ctrl1     ctrl
ctrl2     ctrl
ctrl3     ctrl
stroke1 stroke
stroke2 stroke
stroke3 stroke

PCA

value <- apply(tpm, 1, sd)
idx <- value > quantile(value, 1-2000/nrow(tpm)) 
subM <- tpm[idx, ]

pca <- summary(prcomp(t(subM), center=TRUE, scale=TRUE))
pcadata <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
pcaframe <- data.frame(pcadata, sampleT)
pcaframe
              PC1        PC2  group
ctrl1   -36.81307  5.3101120   ctrl
ctrl2   -42.74179 -7.0657879   ctrl
ctrl3   -40.94367  1.9394916   ctrl
stroke1  42.37128  0.7496646 stroke
stroke2  33.60247  1.5852164 stroke
stroke3  44.52478 -2.5186967 stroke
p <- ggplot(pcaframe, aes(x=PC1, y=PC2, color=group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = paste0("PC1 (", pca$importance[2,"PC1"]*100, "%)"), y = paste0("PC2 (", pca$importance[2,"PC2"]*100, "%)"))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p

Heatmap

pheatmap(subM, 
    cluster_rows = T, 
    cluster_cols = F, 
    show_rownames = F,
    annotation = sampleT, 
    border_color = NA,    
    fontsize = 12, 
    scale = "row", 
    rev(RColorBrewer::brewer.pal(n = 10, name = "RdBu")))

差异表达分析

构建DESeq2对象

sampleT$group = factor(sampleT$group)
sampleT$group
[1] ctrl   ctrl   ctrl   stroke stroke stroke
Levels: ctrl stroke
dds <- DESeqDataSetFromTximport(txi, sampleT, ~group)
dds
class: DESeqDataSet 
dim: 60591 6 
metadata(1): version
assays(2): counts avgTxLength
rownames(60591): A1BG A1BG-AS1 ... ZYXP1 ZZEF1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group

差异表达分析

dds <- DESeq(dds)
res <- results(dds, contrast=c("group", "stroke", "ctrl"))
head(res)
log2 fold change (MLE): group stroke vs ctrl 
Wald test p-value: group stroke vs ctrl 
DataFrame with 6 rows and 6 columns
          baseMean log2FoldChange     lfcSE       stat     pvalue
         <numeric>      <numeric> <numeric>  <numeric>  <numeric>
A1BG      138.9053      0.5696487  0.366932  1.5524628 0.12055154
A1BG-AS1   94.0295     -0.0194117  0.329088 -0.0589862 0.95296312
A1CF       12.2568     -0.0143727  0.875907 -0.0164089 0.98690817
A2M       175.9281     -1.5962724  0.612288 -2.6070619 0.00913229
A2M-AS1    70.6930     -0.9688552  0.494190 -1.9604894 0.04993861
A2ML1      20.0849     -0.3049334  0.624204 -0.4885156 0.62518465
              padj
         <numeric>
A1BG     0.2522373
A1BG-AS1 0.9747968
A1CF     0.9929060
A2M      0.0325865
A2M-AS1  0.1296658
A2ML1    0.7703000
DEG = rep("NC", nrow(res))
DEG[(res$log2FoldChange > 0) & (res$padj < 0.05)] = "UP"
DEG[(res$log2FoldChange < 0) & (res$padj < 0.05)] = "DN"
res$DEG = DEG
summary(factor(DEG))
   DN    NC    UP 
 4366 51576  4649 

火山图

res$LogFDR <- -log10(res$padj)

p <- ggplot(data.frame(res), aes(x = log2FoldChange, y = LogFDR, color = DEG)) + xlim(-20, 20)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1)
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p

查看size factor

需要注意的是,在tximport流程中,每个样本的每个基因有不同的size Factor:

sm = normalizationFactors(dds)
head(sm)
            ctrl1    ctrl2    ctrl3   stroke1   stroke2   stroke3
A1BG     2.712610 2.421541 2.578612 0.4481615 0.5277385 0.2496212
A1BG-AS1 2.670685 2.355683 2.526540 0.3544908 0.5776463 0.3072326
A1CF     3.030272 2.952609 3.031442 0.1937348 0.6274793 0.3032886
A2M      2.452045 3.013408 2.593027 0.4594420 0.6771202 0.1677684
A2M-AS1  2.602384 2.435514 2.612423 0.3835203 0.5086496 0.3095903
A2ML1    2.280079 2.551780 2.102948 0.4340998 0.3336740 0.5642435

如果想查看样本水平的size Factor:

nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
sf
    ctrl1     ctrl2     ctrl3   stroke1   stroke2   stroke3 
2.6208348 2.5460171 2.6137423 0.3837055 0.5411728 0.3090345 

使用spike-in

读入featureCounts

当实验中加入了spike-in,我们也可以考虑使用spike-in进行校正。假设spike-in的read count也在输入的矩阵中,我们可以使用如下流程进行分析。

tx = read.table(file = "featureCounts/featureCounts_table.txt", row.names = NULL, header = TRUE, sep = "\t")
counts = as.matrix(tx[,8:13])
colnames(counts) = gsub("Aligned", "", sapply(strsplit(colnames(counts), "\\."), function(z) z[2]))
rownames(counts) = paste0(tx$Geneid, ' ', tx$gene_name)
head(counts)
                              ctrl1 ctrl2 ctrl3 stroke1 stroke2
ENSG00000223972.5 DDX11L1        63     5    15      16      17
ENSG00000227232.5 WASH7P        306   352   268      69     181
ENSG00000278267.1 MIR6859-1       0     0     0       0       0
ENSG00000243485.5 MIR1302-2HG     0     0     0       0       0
ENSG00000284332.1 MIR1302-2       0     0     0       0       0
ENSG00000237613.2 FAM138A         0     0     0       0       0
                              stroke3
ENSG00000223972.5 DDX11L1          42
ENSG00000227232.5 WASH7P           89
ENSG00000278267.1 MIR6859-1         0
ENSG00000243485.5 MIR1302-2HG       0
ENSG00000284332.1 MIR1302-2         0
ENSG00000237613.2 FAM138A           0

构建对象:

dds2 <- DESeqDataSetFromMatrix(countData = counts,
                              colData = sampleT,
                              design = ~ group)
dds2
class: DESeqDataSet 
dim: 60728 6 
metadata(1): version
assays(1): counts
rownames(60728): ENSG00000223972.5 DDX11L1 ENSG00000227232.5
  WASH7P ... ENSG00000278625.1 AC237676.1 ENSG00000277374.1
  AC116622.1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group

重置size factor

为了演示的方便,我们从上面分析结果中随机选择2000个没有调变的基因作为spike-in基因

# 随机选择2000个基因
spikein = sample(rownames(res)[res$DEG == "NC"], 2000)

# 计算size factor
idx = sapply(strsplit(rownames(dds2), " "), function(z) z[2]) %in% spikein
dds2 = estimateSizeFactors(dds2, controlGenes = idx)
sizeFactors(dds2)
    ctrl1     ctrl2     ctrl3   stroke1   stroke2   stroke3 
2.6814236 2.6833622 2.7274221 0.3451876 0.5137000 0.3081085 

可以看出这个结果和前面流程算出来的结果非常类似。使用这个新的size Factor进行差异表达分析:

dds2 <- DESeq(dds2)
res2 <- results(dds2, contrast=c("group", "stroke", "ctrl"))
head(res2)
log2 fold change (MLE): group stroke vs ctrl 
Wald test p-value: group stroke vs ctrl 
DataFrame with 6 rows and 6 columns
                               baseMean log2FoldChange     lfcSE
                              <numeric>      <numeric> <numeric>
ENSG00000223972.5 DDX11L1       41.1031        2.79453  0.821098
ENSG00000227232.5 WASH7P       197.4425        1.29793  0.297050
ENSG00000278267.1 MIR6859-1      0.0000             NA        NA
ENSG00000243485.5 MIR1302-2HG    0.0000             NA        NA
ENSG00000284332.1 MIR1302-2      0.0000             NA        NA
ENSG00000237613.2 FAM138A        0.0000             NA        NA
                                   stat      pvalue        padj
                              <numeric>   <numeric>   <numeric>
ENSG00000223972.5 DDX11L1       3.40340 6.65525e-04 2.72921e-03
ENSG00000227232.5 WASH7P        4.36938 1.24598e-05 7.86367e-05
ENSG00000278267.1 MIR6859-1          NA          NA          NA
ENSG00000243485.5 MIR1302-2HG        NA          NA          NA
ENSG00000284332.1 MIR1302-2          NA          NA          NA
ENSG00000237613.2 FAM138A            NA          NA          NA

Likelihood ratio test

design matrix

The Wald test检验的是log2 fold change是否为0,当有多个因素的时候,可以使用Likelihood ratio test (LRT)。为了演示的方便,我们假设测试数据来自于2个批次:

sampleT <- data.frame(sampleT, batch = factor(rep(c("first", "second"), 3)))
sampleT
         group  batch
ctrl1     ctrl  first
ctrl2     ctrl second
ctrl3     ctrl  first
stroke1 stroke second
stroke2 stroke  first
stroke3 stroke second

差异分析校正批次

dds3 <- DESeqDataSetFromTximport(txi, sampleT, ~ batch + group)
dds3 <- DESeq(dds3, test="LRT", reduced= ~ batch)
res3 <- results(dds3)
head(res3)
log2 fold change (MLE): group stroke vs ctrl 
LRT p-value: '~ batch + group' vs '~ batch' 
DataFrame with 6 rows and 6 columns
          baseMean log2FoldChange     lfcSE       stat    pvalue
         <numeric>      <numeric> <numeric>  <numeric> <numeric>
A1BG      138.9053      0.3788450  0.332901 1.30135719 0.2539655
A1BG-AS1   94.0295     -0.1443174  0.335470 0.18675073 0.6656348
A1CF       12.2568     -0.0986076  1.033360 0.00821582 0.9277777
A2M       175.9281     -1.4112335  0.673109 3.33778411 0.0677057
A2M-AS1    70.6930     -0.7652698  0.515541 2.15078246 0.1424973
A2ML1      20.0849     -0.3870942  0.747196 0.26467387 0.6069272
              padj
         <numeric>
A1BG      0.448675
A1BG-AS1  0.808947
A1CF      0.964256
A2M       0.172948
A2M-AS1   0.301514
A2ML1     0.768356

结果比较

DEG = rep("NC", nrow(res3))
DEG[(res3$log2FoldChange > 0) & (res3$padj < 0.05)] = "UP"
DEG[(res3$log2FoldChange < 0) & (res3$padj < 0.05)] = "DN"
res3$DEG = DEG
table(res$DEG, res3$DEG)
    
        DN    NC    UP
  DN  3740   626     0
  NC   101 51409    66
  UP     0   925  3724

可以看出两种结果的重合度非常高。

结果保存和绘图

绘图展示

我们可以使用EnhancedVolcano以及pheatmap来展示分析的结果。

输出分析结果

write.table(res3, file = "DESeq2_res.txt", row.names = TRUE, col.names=TRUE, sep = "\t", quote = FALSE)