Bioinformatics training course: bulk RNA-seq: Bulk RNA-seq的差异表达分析

全局设定

我们将使用系统内安装的R：

/sibcb/program/install/r-4.0/bin/R

设定R的library path：

.libPaths('/sibcb1/bioinformatics/training_shared/software/rlib/r-4.0_lib/')
library("tximport")
library("reshape2")
library("DESeq2")
library("ggplot2")
library("pheatmap")
library("RColorBrewer")
library("pasilla")

开始前的文件准备：

ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/idx .
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/alignments .
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/kallisto .

读入Kallistio的定量结果

定量结果

SRX = list.files("kallisto")
files = paste0("kallisto/", SRX, "/abundance.tsv")
names(files) = SRX
cbind(files)

        files                           
ctrl1   "kallisto/ctrl1/abundance.tsv"  
ctrl2   "kallisto/ctrl2/abundance.tsv"  
ctrl3   "kallisto/ctrl3/abundance.tsv"  
stroke1 "kallisto/stroke1/abundance.tsv"
stroke2 "kallisto/stroke2/abundance.tsv"
stroke3 "kallisto/stroke3/abundance.tsv"

基因注释表

tx2gene = read.table(file = 'idx/kallisto/tx2g_symbol.txt', row.names = NULL, header = FALSE, sep = "\t")
head(tx2gene)

                 V1          V2                V3
1 ENST00000456328.2     DDX11L1 ENSG00000223972.5
2 ENST00000450305.2     DDX11L1 ENSG00000223972.5
3 ENST00000488147.1      WASH7P ENSG00000227232.5
4 ENST00000619216.1   MIR6859-1 ENSG00000278267.1
5 ENST00000473358.1 MIR1302-2HG ENSG00000243485.5
6 ENST00000469289.1 MIR1302-2HG ENSG00000243485.5

tx2gene = tx2gene[, c(1,3)]

tximport

txi <- tximport(files, type = "kallisto", tx2gene = tx2gene,ignoreAfterBar = TRUE)
names(txi)

[1] "abundance"           "counts"              "infReps"            
[4] "length"              "countsFromAbundance"

相对表达量 (TPM)：

tpm = log2(txi$abundance + 1)
head(txi$abundance)

                        ctrl1       ctrl2       ctrl3     stroke1
ENSG00000000003.15   0.144827   0.2413849   0.2294918  0.01834390
ENSG00000000005.6    0.000000   0.0000000   0.0000000  0.00000000
ENSG00000000419.12   4.464381   4.5101390   3.6269030  0.05903982
ENSG00000000457.14   3.327530   3.8490000   3.8062800  0.22330080
ENSG00000000460.17   1.437791   1.4126760   1.6546001  0.12307160
ENSG00000000938.13 141.307015 151.9172000 113.5824000 16.15440000
                        stroke2     stroke3
ENSG00000000003.15  0.003318682  0.02135593
ENSG00000000005.6   0.000000000  0.00000000
ENSG00000000419.12  0.336553980  0.23208100
ENSG00000000457.14  0.387501002  0.21362977
ENSG00000000460.17  0.106744900  0.12046514
ENSG00000000938.13 27.043837090 11.38226400

质量控制

design matirx

sampleT = data.frame(group = gsub("[123]", "", colnames(tpm)), row.names = colnames(tpm))
sampleT

         group
ctrl1     ctrl
ctrl2     ctrl
ctrl3     ctrl
stroke1 stroke
stroke2 stroke
stroke3 stroke

PCA

value <- apply(tpm, 1, sd)
idx <- value > quantile(value, 1-2000/nrow(tpm)) 
subM <- tpm[idx, ]

pca <- summary(prcomp(t(subM), center=TRUE, scale=TRUE))
pcadata <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
pcaframe <- data.frame(pcadata, sampleT)
pcaframe

              PC1        PC2  group
ctrl1   -36.81205  5.3036163   ctrl
ctrl2   -42.74117 -7.0702261   ctrl
ctrl3   -40.94456  1.9504036   ctrl
stroke1  42.38749  0.7511756 stroke
stroke2  33.58928  1.5833694 stroke
stroke3  44.52101 -2.5183389 stroke

p <- ggplot(pcaframe, aes(x=PC1, y=PC2, color=group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = paste0("PC1 (", pca$importance[2,"PC1"]*100, "%)"), y = paste0("PC2 (", pca$importance[2,"PC2"]*100, "%)"))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p

Heatmap

pheatmap(subM, 
    cluster_rows = T, 
    cluster_cols = F, 
    show_rownames = F,
    annotation = sampleT, 
    border_color = NA,    
    fontsize = 12, 
    scale = "row", 
    rev(RColorBrewer::brewer.pal(n = 10, name = "RdBu")))

差异表达分析

构建DESeq2对象

sampleT$group = factor(sampleT$group)
sampleT$group

[1] ctrl   ctrl   ctrl   stroke stroke stroke
Levels: ctrl stroke

dds <- DESeqDataSetFromTximport(txi, sampleT, ~group)
dds

class: DESeqDataSet 
dim: 60669 6 
metadata(1): version
assays(2): counts avgTxLength
rownames(60669): ENSG00000000003.15 ENSG00000000005.6 ...
  ENSG00000288603.1 ENSG00000288604.1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group

差异表达分析

dds <- DESeq(dds)
res <- results(dds, contrast=c("group", "stroke", "ctrl"))
head(res)

log2 fold change (MLE): group stroke vs ctrl 
Wald test p-value: group stroke vs ctrl 
DataFrame with 6 rows and 6 columns
                    baseMean log2FoldChange     lfcSE      stat
                   <numeric>      <numeric> <numeric> <numeric>
ENSG00000000003.15   16.2739     -0.1400929  0.781342 -0.179298
ENSG00000000005.6     0.0000             NA        NA        NA
ENSG00000000419.12   55.3414     -0.8367420  0.499906 -1.673798
ENSG00000000457.14  192.5471     -0.2257509  0.228626 -0.987423
ENSG00000000460.17   95.5954     -0.0979387  0.354460 -0.276304
ENSG00000000938.13 7187.2502      0.5671468  0.222620  2.547595
                      pvalue      padj
                   <numeric> <numeric>
ENSG00000000003.15 0.8577039 0.9207711
ENSG00000000005.6         NA        NA
ENSG00000000419.12 0.0941702 0.2093662
ENSG00000000457.14 0.3234352 0.5056387
ENSG00000000460.17 0.7823147 0.8753066
ENSG00000000938.13 0.0108468 0.0376585

DEG = rep("NC", nrow(res))
DEG[(res$log2FoldChange > 0) & (res$padj < 0.05)] = "UP"
DEG[(res$log2FoldChange < 0) & (res$padj < 0.05)] = "DN"
res$DEG = DEG
summary(factor(DEG))

   DN    NC    UP 
 4374 51643  4652

火山图

res$LogFDR <- -log10(res$padj)

p <- ggplot(data.frame(res), aes(x = log2FoldChange, y = LogFDR, color = DEG)) + xlim(-20, 20)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1)
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p

查看size factor

需要注意的是，在tximport流程中，每个样本的每个基因有不同的size Factor:

sm = normalizationFactors(dds)
head(sm)

                      ctrl1    ctrl2    ctrl3   stroke1   stroke2
ENSG00000000003.15 2.686860 2.408868 2.188957 0.3956640 0.5585808
ENSG00000000005.6  2.572125 2.498673 2.564378 0.3766075 0.5309121
ENSG00000000419.12 3.125067 3.068886 3.194642 0.3213788 0.3866812
ENSG00000000457.14 3.010153 3.017888 2.868701 0.3041469 0.4341758
ENSG00000000460.17 2.587701 2.747982 2.416827 0.3902568 0.5068346
ENSG00000000938.13 2.644581 2.744766 2.857571 0.3430555 0.4726049
                     stroke3
ENSG00000000003.15 0.3193687
ENSG00000000005.6  0.3034625
ENSG00000000419.12 0.2626440
ENSG00000000457.14 0.2905856
ENSG00000000460.17 0.2941777
ENSG00000000938.13 0.2973570

如果想查看样本水平的size Factor:

nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
sf

    ctrl1     ctrl2     ctrl3   stroke1   stroke2   stroke3 
2.6208466 2.5460034 2.6129527 0.3837413 0.5409688 0.3092107

使用spike-in （STAR）

读入gene-level counts

rpgFiles <- list.files('alignments', 'ReadsPerGene')
rpg <- list()
for(i in 1:length(rpgFiles)){
  rpg[[i]] <- read.table(file = paste0('alignments/', rpgFiles[i]), row.names = 1, header = FALSE, sep = '\t', skip = 4)
}
names(rpg) <- sapply(strsplit(rpgFiles, 'Reads'), function(z) z[1])

PG <- unique(unlist(lapply(rpg, rownames)))
counts <- matrix(0, length(PG), length(rpg))
dimnames(counts) <- list(PG, names(rpg))
for(i in 1:length(rpg)){
  tx <- rpg[[i]]
  counts[rownames(tx), i] = tx[,1]
}
counts <- counts[rowSums(counts) >0, ]
head(counts)

                  ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
ENSG00000223972.5    61     4    12      17      18      43
ENSG00000227232.5    94    80    49      31      54      37
ENSG00000278267.1     0     6     1       0       3       0
ENSG00000238009.6     3     2    10       8       2       2
ENSG00000233750.3    89    37   182     138     144      22
ENSG00000268903.1   119    62   201     125     135      34

构建对象：

dds2 <- DESeqDataSetFromMatrix(countData = counts,
                              colData = sampleT,
                              design = ~ group)
dds2

class: DESeqDataSet 
dim: 39787 6 
metadata(1): version
assays(1): counts
rownames(39787): ENSG00000223972.5 ENSG00000227232.5 ...
  ENSG00000278817.1 ENSG00000277196.4
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group

重置size factor

dds2 = estimateSizeFactors(dds2)
sizeFactors(dds2)

    ctrl1     ctrl2     ctrl3   stroke1   stroke2   stroke3 
2.7894032 2.8480426 2.8474086 0.3364895 0.4881396 0.2909063

sizeFactors(dds2) <- sf
sizeFactors(dds2)

    ctrl1     ctrl2     ctrl3   stroke1   stroke2   stroke3 
2.6208466 2.5460034 2.6129527 0.3837413 0.5409688 0.3092107

dds2 <- DESeq(dds2)
res2 <- results(dds2, contrast=c("group", "stroke", "ctrl"))
head(res2)

log2 fold change (MLE): group stroke vs ctrl 
Wald test p-value: group stroke vs ctrl 
DataFrame with 6 rows and 6 columns
                   baseMean log2FoldChange     lfcSE      stat
                  <numeric>      <numeric> <numeric> <numeric>
ENSG00000223972.5  41.01276        2.86646  0.872722  3.284507
ENSG00000227232.5  64.38413        1.79795  0.372777  4.823126
ENSG00000278267.1   1.38082        1.13939  2.305405  0.494226
ENSG00000238009.6   6.12831        2.41026  1.063885  2.265528
ENSG00000233750.3 135.84989        2.56410  0.634637  4.040269
ENSG00000268903.1 138.65525        2.22815  0.502144  4.437265
                       pvalue        padj
                    <numeric>   <numeric>
ENSG00000223972.5 1.02161e-03 3.70330e-03
ENSG00000227232.5 1.41325e-06 8.92772e-06
ENSG00000278267.1 6.21147e-01          NA
ENSG00000238009.6 2.34803e-02 5.78889e-02
ENSG00000233750.3 5.33900e-05 2.53527e-04
ENSG00000268903.1 9.11089e-06 5.02225e-05

DEG = rep("NC", nrow(res2))
DEG[(res2$log2FoldChange > 0) & (res2$padj < 0.05)] = "UP"
DEG[(res2$log2FoldChange < 0) & (res2$padj < 0.05)] = "DN"
res2$DEG = DEG
head(res2)

log2 fold change (MLE): group stroke vs ctrl 
Wald test p-value: group stroke vs ctrl 
DataFrame with 6 rows and 7 columns
                   baseMean log2FoldChange     lfcSE      stat
                  <numeric>      <numeric> <numeric> <numeric>
ENSG00000223972.5  41.01276        2.86646  0.872722  3.284507
ENSG00000227232.5  64.38413        1.79795  0.372777  4.823126
ENSG00000278267.1   1.38082        1.13939  2.305405  0.494226
ENSG00000238009.6   6.12831        2.41026  1.063885  2.265528
ENSG00000233750.3 135.84989        2.56410  0.634637  4.040269
ENSG00000268903.1 138.65525        2.22815  0.502144  4.437265
                       pvalue        padj         DEG
                    <numeric>   <numeric> <character>
ENSG00000223972.5 1.02161e-03 3.70330e-03          UP
ENSG00000227232.5 1.41325e-06 8.92772e-06          UP
ENSG00000278267.1 6.21147e-01          NA          NC
ENSG00000238009.6 2.34803e-02 5.78889e-02          NC
ENSG00000233750.3 5.33900e-05 2.53527e-04          UP
ENSG00000268903.1 9.11089e-06 5.02225e-05          UP

比较结果

我们比较一下Kallisto和STAR的分析结果：

sharedGenes <- intersect(rownames(res), rownames(res2))
shared_res  <- res[sharedGenes, ]
shared_res2 <- res2[sharedGenes, ]
table(shared_res$DEG, shared_res2$DEG)

    
        DN    NC    UP
  DN  3766   521     5
  NC  2369 27953   743
  UP    49  1111  3249

Likelihood ratio test

design matrix

The Wald test检验的是log2 fold change是否为0，当有多个因素的时候，可以使用Likelihood ratio test (LRT)。为了演示的方便，我们假设测试数据来自于2个批次：

sampleT <- data.frame(sampleT, batch = factor(rep(c("first", "second"), 3)))
sampleT

         group  batch
ctrl1     ctrl  first
ctrl2     ctrl second
ctrl3     ctrl  first
stroke1 stroke second
stroke2 stroke  first
stroke3 stroke second

差异分析校正批次

dds3 <- DESeqDataSetFromTximport(txi, sampleT, ~ batch + group)
dds3 <- DESeq(dds3, test="LRT", reduced= ~ batch)
res3 <- results(dds3)
head(res3)

log2 fold change (MLE): group stroke vs ctrl 
LRT p-value: '~ batch + group' vs '~ batch' 
DataFrame with 6 rows and 6 columns
                    baseMean log2FoldChange     lfcSE      stat
                   <numeric>      <numeric> <numeric> <numeric>
ENSG00000000003.15   16.2739      -0.914923  0.850747  0.858527
ENSG00000000005.6     0.0000             NA        NA        NA
ENSG00000000419.12   55.3414      -0.799549  0.587278  1.911276
ENSG00000000457.14  192.5471      -0.219561  0.276310  0.635605
ENSG00000000460.17   95.5954      -0.227161  0.408279  0.270077
ENSG00000000938.13 7187.2502       0.579498  0.265001  4.942615
                      pvalue      padj
                   <numeric> <numeric>
ENSG00000000003.15 0.3541513 0.5536369
ENSG00000000005.6         NA        NA
ENSG00000000419.12 0.1668216 0.3369992
ENSG00000000457.14 0.4253069 0.6201386
ENSG00000000460.17 0.6032804 0.7662977
ENSG00000000938.13 0.0262024 0.0822499

结果比较

DEG = rep("NC", nrow(res3))
DEG[(res3$log2FoldChange > 0) & (res3$padj < 0.05)] = "UP"
DEG[(res3$log2FoldChange < 0) & (res3$padj < 0.05)] = "DN"
res3$DEG = DEG
table(res$DEG, res3$DEG)

    
        DN    NC    UP
  DN  3747   627     0
  NC   103 51474    66
  UP     0   924  3728

可以看出两种结果的重合度非常高。

结果保存和绘图

绘图展示

我们可以使用EnhancedVolcano以及pheatmap来展示分析的结果。

输出分析结果

write.table(res3, file = "DESeq2_res.txt", row.names = TRUE, col.names=TRUE, sep = "\t", quote = FALSE)