我们将使用系统内安装的R:
/sibcb/program/install/r-4.0/bin/R
设定R的library path:
开始前的文件准备:
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/idx .
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/alignments .
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/kallisto .
SRX = list.files("kallisto")
files = paste0("kallisto/", SRX, "/abundance.tsv")
names(files) = SRX
cbind(files)
files
ctrl1 "kallisto/ctrl1/abundance.tsv"
ctrl2 "kallisto/ctrl2/abundance.tsv"
ctrl3 "kallisto/ctrl3/abundance.tsv"
stroke1 "kallisto/stroke1/abundance.tsv"
stroke2 "kallisto/stroke2/abundance.tsv"
stroke3 "kallisto/stroke3/abundance.tsv"
tx2gene = read.table(file = 'idx/kallisto/tx2g_symbol.txt', row.names = NULL, header = FALSE, sep = "\t")
head(tx2gene)
V1 V2 V3
1 ENST00000456328.2 DDX11L1 ENSG00000223972.5
2 ENST00000450305.2 DDX11L1 ENSG00000223972.5
3 ENST00000488147.1 WASH7P ENSG00000227232.5
4 ENST00000619216.1 MIR6859-1 ENSG00000278267.1
5 ENST00000473358.1 MIR1302-2HG ENSG00000243485.5
6 ENST00000469289.1 MIR1302-2HG ENSG00000243485.5
tx2gene = tx2gene[, c(1,3)]
[1] "abundance" "counts" "infReps"
[4] "length" "countsFromAbundance"
相对表达量 (TPM):
ctrl1 ctrl2 ctrl3 stroke1
ENSG00000000003.15 0.144827 0.2413849 0.2294918 0.01834390
ENSG00000000005.6 0.000000 0.0000000 0.0000000 0.00000000
ENSG00000000419.12 4.464381 4.5101390 3.6269030 0.05903982
ENSG00000000457.14 3.327530 3.8490000 3.8062800 0.22330080
ENSG00000000460.17 1.437791 1.4126760 1.6546001 0.12307160
ENSG00000000938.13 141.307015 151.9172000 113.5824000 16.15440000
stroke2 stroke3
ENSG00000000003.15 0.003318682 0.02135593
ENSG00000000005.6 0.000000000 0.00000000
ENSG00000000419.12 0.336553980 0.23208100
ENSG00000000457.14 0.387501002 0.21362977
ENSG00000000460.17 0.106744900 0.12046514
ENSG00000000938.13 27.043837090 11.38226400
sampleT = data.frame(group = gsub("[123]", "", colnames(tpm)), row.names = colnames(tpm))
sampleT
group
ctrl1 ctrl
ctrl2 ctrl
ctrl3 ctrl
stroke1 stroke
stroke2 stroke
stroke3 stroke
value <- apply(tpm, 1, sd)
idx <- value > quantile(value, 1-2000/nrow(tpm))
subM <- tpm[idx, ]
pca <- summary(prcomp(t(subM), center=TRUE, scale=TRUE))
pcadata <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
pcaframe <- data.frame(pcadata, sampleT)
pcaframe
PC1 PC2 group
ctrl1 -36.81205 5.3036163 ctrl
ctrl2 -42.74117 -7.0702261 ctrl
ctrl3 -40.94456 1.9504036 ctrl
stroke1 42.38749 0.7511756 stroke
stroke2 33.58928 1.5833694 stroke
stroke3 44.52101 -2.5183389 stroke
p <- ggplot(pcaframe, aes(x=PC1, y=PC2, color=group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = paste0("PC1 (", pca$importance[2,"PC1"]*100, "%)"), y = paste0("PC2 (", pca$importance[2,"PC2"]*100, "%)"))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p
pheatmap(subM,
cluster_rows = T,
cluster_cols = F,
show_rownames = F,
annotation = sampleT,
border_color = NA,
fontsize = 12,
scale = "row",
rev(RColorBrewer::brewer.pal(n = 10, name = "RdBu")))
sampleT$group = factor(sampleT$group)
sampleT$group
[1] ctrl ctrl ctrl stroke stroke stroke
Levels: ctrl stroke
dds <- DESeqDataSetFromTximport(txi, sampleT, ~group)
dds
class: DESeqDataSet
dim: 60669 6
metadata(1): version
assays(2): counts avgTxLength
rownames(60669): ENSG00000000003.15 ENSG00000000005.6 ...
ENSG00000288603.1 ENSG00000288604.1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group
log2 fold change (MLE): group stroke vs ctrl
Wald test p-value: group stroke vs ctrl
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000000003.15 16.2739 -0.1400929 0.781342 -0.179298
ENSG00000000005.6 0.0000 NA NA NA
ENSG00000000419.12 55.3414 -0.8367420 0.499906 -1.673798
ENSG00000000457.14 192.5471 -0.2257509 0.228626 -0.987423
ENSG00000000460.17 95.5954 -0.0979387 0.354460 -0.276304
ENSG00000000938.13 7187.2502 0.5671468 0.222620 2.547595
pvalue padj
<numeric> <numeric>
ENSG00000000003.15 0.8577039 0.9207711
ENSG00000000005.6 NA NA
ENSG00000000419.12 0.0941702 0.2093662
ENSG00000000457.14 0.3234352 0.5056387
ENSG00000000460.17 0.7823147 0.8753066
ENSG00000000938.13 0.0108468 0.0376585
DEG = rep("NC", nrow(res))
DEG[(res$log2FoldChange > 0) & (res$padj < 0.05)] = "UP"
DEG[(res$log2FoldChange < 0) & (res$padj < 0.05)] = "DN"
res$DEG = DEG
summary(factor(DEG))
DN NC UP
4374 51643 4652
res$LogFDR <- -log10(res$padj)
p <- ggplot(data.frame(res), aes(x = log2FoldChange, y = LogFDR, color = DEG)) + xlim(-20, 20)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1)
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p
需要注意的是,在tximport
流程中,每个样本的每个基因有不同的size Factor
:
sm = normalizationFactors(dds)
head(sm)
ctrl1 ctrl2 ctrl3 stroke1 stroke2
ENSG00000000003.15 2.686860 2.408868 2.188957 0.3956640 0.5585808
ENSG00000000005.6 2.572125 2.498673 2.564378 0.3766075 0.5309121
ENSG00000000419.12 3.125067 3.068886 3.194642 0.3213788 0.3866812
ENSG00000000457.14 3.010153 3.017888 2.868701 0.3041469 0.4341758
ENSG00000000460.17 2.587701 2.747982 2.416827 0.3902568 0.5068346
ENSG00000000938.13 2.644581 2.744766 2.857571 0.3430555 0.4726049
stroke3
ENSG00000000003.15 0.3193687
ENSG00000000005.6 0.3034625
ENSG00000000419.12 0.2626440
ENSG00000000457.14 0.2905856
ENSG00000000460.17 0.2941777
ENSG00000000938.13 0.2973570
如果想查看样本水平的size Factor
:
nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
sf
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
2.6208466 2.5460034 2.6129527 0.3837413 0.5409688 0.3092107
rpgFiles <- list.files('alignments', 'ReadsPerGene')
rpg <- list()
for(i in 1:length(rpgFiles)){
rpg[[i]] <- read.table(file = paste0('alignments/', rpgFiles[i]), row.names = 1, header = FALSE, sep = '\t', skip = 4)
}
names(rpg) <- sapply(strsplit(rpgFiles, 'Reads'), function(z) z[1])
PG <- unique(unlist(lapply(rpg, rownames)))
counts <- matrix(0, length(PG), length(rpg))
dimnames(counts) <- list(PG, names(rpg))
for(i in 1:length(rpg)){
tx <- rpg[[i]]
counts[rownames(tx), i] = tx[,1]
}
counts <- counts[rowSums(counts) >0, ]
head(counts)
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
ENSG00000223972.5 61 4 12 17 18 43
ENSG00000227232.5 94 80 49 31 54 37
ENSG00000278267.1 0 6 1 0 3 0
ENSG00000238009.6 3 2 10 8 2 2
ENSG00000233750.3 89 37 182 138 144 22
ENSG00000268903.1 119 62 201 125 135 34
构建对象:
dds2 <- DESeqDataSetFromMatrix(countData = counts,
colData = sampleT,
design = ~ group)
dds2
class: DESeqDataSet
dim: 39787 6
metadata(1): version
assays(1): counts
rownames(39787): ENSG00000223972.5 ENSG00000227232.5 ...
ENSG00000278817.1 ENSG00000277196.4
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group
dds2 = estimateSizeFactors(dds2)
sizeFactors(dds2)
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
2.7894032 2.8480426 2.8474086 0.3364895 0.4881396 0.2909063
sizeFactors(dds2) <- sf
sizeFactors(dds2)
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
2.6208466 2.5460034 2.6129527 0.3837413 0.5409688 0.3092107
log2 fold change (MLE): group stroke vs ctrl
Wald test p-value: group stroke vs ctrl
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972.5 41.01276 2.86646 0.872722 3.284507
ENSG00000227232.5 64.38413 1.79795 0.372777 4.823126
ENSG00000278267.1 1.38082 1.13939 2.305405 0.494226
ENSG00000238009.6 6.12831 2.41026 1.063885 2.265528
ENSG00000233750.3 135.84989 2.56410 0.634637 4.040269
ENSG00000268903.1 138.65525 2.22815 0.502144 4.437265
pvalue padj
<numeric> <numeric>
ENSG00000223972.5 1.02161e-03 3.70330e-03
ENSG00000227232.5 1.41325e-06 8.92772e-06
ENSG00000278267.1 6.21147e-01 NA
ENSG00000238009.6 2.34803e-02 5.78889e-02
ENSG00000233750.3 5.33900e-05 2.53527e-04
ENSG00000268903.1 9.11089e-06 5.02225e-05
DEG = rep("NC", nrow(res2))
DEG[(res2$log2FoldChange > 0) & (res2$padj < 0.05)] = "UP"
DEG[(res2$log2FoldChange < 0) & (res2$padj < 0.05)] = "DN"
res2$DEG = DEG
head(res2)
log2 fold change (MLE): group stroke vs ctrl
Wald test p-value: group stroke vs ctrl
DataFrame with 6 rows and 7 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972.5 41.01276 2.86646 0.872722 3.284507
ENSG00000227232.5 64.38413 1.79795 0.372777 4.823126
ENSG00000278267.1 1.38082 1.13939 2.305405 0.494226
ENSG00000238009.6 6.12831 2.41026 1.063885 2.265528
ENSG00000233750.3 135.84989 2.56410 0.634637 4.040269
ENSG00000268903.1 138.65525 2.22815 0.502144 4.437265
pvalue padj DEG
<numeric> <numeric> <character>
ENSG00000223972.5 1.02161e-03 3.70330e-03 UP
ENSG00000227232.5 1.41325e-06 8.92772e-06 UP
ENSG00000278267.1 6.21147e-01 NA NC
ENSG00000238009.6 2.34803e-02 5.78889e-02 NC
ENSG00000233750.3 5.33900e-05 2.53527e-04 UP
ENSG00000268903.1 9.11089e-06 5.02225e-05 UP
我们比较一下Kallisto和STAR的分析结果:
sharedGenes <- intersect(rownames(res), rownames(res2))
shared_res <- res[sharedGenes, ]
shared_res2 <- res2[sharedGenes, ]
table(shared_res$DEG, shared_res2$DEG)
DN NC UP
DN 3766 521 5
NC 2369 27953 743
UP 49 1111 3249
The Wald test检验的是log2 fold change是否为0,当有多个因素的时候,可以使用Likelihood ratio test (LRT)。为了演示的方便,我们假设测试数据来自于2个批次:
sampleT <- data.frame(sampleT, batch = factor(rep(c("first", "second"), 3)))
sampleT
group batch
ctrl1 ctrl first
ctrl2 ctrl second
ctrl3 ctrl first
stroke1 stroke second
stroke2 stroke first
stroke3 stroke second
dds3 <- DESeqDataSetFromTximport(txi, sampleT, ~ batch + group)
dds3 <- DESeq(dds3, test="LRT", reduced= ~ batch)
res3 <- results(dds3)
head(res3)
log2 fold change (MLE): group stroke vs ctrl
LRT p-value: '~ batch + group' vs '~ batch'
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000000003.15 16.2739 -0.914923 0.850747 0.858527
ENSG00000000005.6 0.0000 NA NA NA
ENSG00000000419.12 55.3414 -0.799549 0.587278 1.911276
ENSG00000000457.14 192.5471 -0.219561 0.276310 0.635605
ENSG00000000460.17 95.5954 -0.227161 0.408279 0.270077
ENSG00000000938.13 7187.2502 0.579498 0.265001 4.942615
pvalue padj
<numeric> <numeric>
ENSG00000000003.15 0.3541513 0.5536369
ENSG00000000005.6 NA NA
ENSG00000000419.12 0.1668216 0.3369992
ENSG00000000457.14 0.4253069 0.6201386
ENSG00000000460.17 0.6032804 0.7662977
ENSG00000000938.13 0.0262024 0.0822499
DEG = rep("NC", nrow(res3))
DEG[(res3$log2FoldChange > 0) & (res3$padj < 0.05)] = "UP"
DEG[(res3$log2FoldChange < 0) & (res3$padj < 0.05)] = "DN"
res3$DEG = DEG
table(res$DEG, res3$DEG)
DN NC UP
DN 3747 627 0
NC 103 51474 66
UP 0 924 3728
可以看出两种结果的重合度非常高。
我们可以使用EnhancedVolcano以及pheatmap来展示分析的结果。
write.table(res3, file = "DESeq2_res.txt", row.names = TRUE, col.names=TRUE, sep = "\t", quote = FALSE)