我们将使用系统内安装的R:
/sibcb/program/install/r-4.0/bin/R
设定R的library path:
开始前的文件准备:
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/idx
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/featureCounts
ln -s /sibcb1/bioinformatics/biocourse/biocourse01/bulkRNAseq/kallisto
SRX = list.files("kallisto")
files = paste0("kallisto/", SRX, "/abundance.tsv")
names(files) = SRX
cbind(files)
files
ctrl1 "kallisto/ctrl1/abundance.tsv"
ctrl2 "kallisto/ctrl2/abundance.tsv"
ctrl3 "kallisto/ctrl3/abundance.tsv"
stroke1 "kallisto/stroke1/abundance.tsv"
stroke2 "kallisto/stroke2/abundance.tsv"
stroke3 "kallisto/stroke3/abundance.tsv"
tx2gene = read.table(file = 'idx/kallisto/tx2g_symbol.txt', row.names = NULL, header = FALSE, sep = "\t")
head(tx2gene)
V1 V2 V3
1 ENST00000456328.2 DDX11L1 ENSG00000223972.5
2 ENST00000450305.2 DDX11L1 ENSG00000223972.5
3 ENST00000488147.1 WASH7P ENSG00000227232.5
4 ENST00000619216.1 MIR6859-1 ENSG00000278267.1
5 ENST00000473358.1 MIR1302-2HG ENSG00000243485.5
6 ENST00000469289.1 MIR1302-2HG ENSG00000243485.5
[1] "abundance" "counts" "infReps"
[4] "length" "countsFromAbundance"
相对表达量 (TPM):
ctrl1 ctrl2 ctrl3 stroke1 stroke2
A1BG 7.0668450 12.0477869 6.8267015 0.91054220 1.176894900
A1BG-AS1 2.1620080 3.1512340 2.0337040 0.19374250 0.246123092
A1CF 0.0575614 0.0635564 0.0664542 0.01326064 0.005202681
A2M 7.5932940 6.7255700 21.7159070 0.14195205 0.359023400
A2M-AS1 0.8916910 1.0821252 2.4603140 0.05171970 0.111497000
A2ML1 0.4142338 0.6031055 0.6026923 0.06351190 0.044580240
stroke3
A1BG 1.12601700
A1BG-AS1 0.18466270
A1CF 0.00000000
A2M 0.48542780
A2M-AS1 0.04178862
A2ML1 0.00966699
sampleT = data.frame(group = gsub("[123]", "", colnames(tpm)), row.names = colnames(tpm))
sampleT
group
ctrl1 ctrl
ctrl2 ctrl
ctrl3 ctrl
stroke1 stroke
stroke2 stroke
stroke3 stroke
value <- apply(tpm, 1, sd)
idx <- value > quantile(value, 1-2000/nrow(tpm))
subM <- tpm[idx, ]
pca <- summary(prcomp(t(subM), center=TRUE, scale=TRUE))
pcadata <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
pcaframe <- data.frame(pcadata, sampleT)
pcaframe
PC1 PC2 group
ctrl1 -36.81307 5.3101120 ctrl
ctrl2 -42.74179 -7.0657879 ctrl
ctrl3 -40.94367 1.9394916 ctrl
stroke1 42.37128 0.7496646 stroke
stroke2 33.60247 1.5852164 stroke
stroke3 44.52478 -2.5186967 stroke
p <- ggplot(pcaframe, aes(x=PC1, y=PC2, color=group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = paste0("PC1 (", pca$importance[2,"PC1"]*100, "%)"), y = paste0("PC2 (", pca$importance[2,"PC2"]*100, "%)"))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p
pheatmap(subM,
cluster_rows = T,
cluster_cols = F,
show_rownames = F,
annotation = sampleT,
border_color = NA,
fontsize = 12,
scale = "row",
rev(RColorBrewer::brewer.pal(n = 10, name = "RdBu")))
sampleT$group = factor(sampleT$group)
sampleT$group
[1] ctrl ctrl ctrl stroke stroke stroke
Levels: ctrl stroke
dds <- DESeqDataSetFromTximport(txi, sampleT, ~group)
dds
class: DESeqDataSet
dim: 60591 6
metadata(1): version
assays(2): counts avgTxLength
rownames(60591): A1BG A1BG-AS1 ... ZYXP1 ZZEF1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group
log2 fold change (MLE): group stroke vs ctrl
Wald test p-value: group stroke vs ctrl
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 138.9053 0.5696487 0.366932 1.5524628 0.12055154
A1BG-AS1 94.0295 -0.0194117 0.329088 -0.0589862 0.95296312
A1CF 12.2568 -0.0143727 0.875907 -0.0164089 0.98690817
A2M 175.9281 -1.5962724 0.612288 -2.6070619 0.00913229
A2M-AS1 70.6930 -0.9688552 0.494190 -1.9604894 0.04993861
A2ML1 20.0849 -0.3049334 0.624204 -0.4885156 0.62518465
padj
<numeric>
A1BG 0.2522373
A1BG-AS1 0.9747968
A1CF 0.9929060
A2M 0.0325865
A2M-AS1 0.1296658
A2ML1 0.7703000
DEG = rep("NC", nrow(res))
DEG[(res$log2FoldChange > 0) & (res$padj < 0.05)] = "UP"
DEG[(res$log2FoldChange < 0) & (res$padj < 0.05)] = "DN"
res$DEG = DEG
summary(factor(DEG))
DN NC UP
4366 51576 4649
res$LogFDR <- -log10(res$padj)
p <- ggplot(data.frame(res), aes(x = log2FoldChange, y = LogFDR, color = DEG)) + xlim(-20, 20)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1)
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p
需要注意的是,在tximport
流程中,每个样本的每个基因有不同的size Factor
:
sm = normalizationFactors(dds)
head(sm)
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
A1BG 2.712610 2.421541 2.578612 0.4481615 0.5277385 0.2496212
A1BG-AS1 2.670685 2.355683 2.526540 0.3544908 0.5776463 0.3072326
A1CF 3.030272 2.952609 3.031442 0.1937348 0.6274793 0.3032886
A2M 2.452045 3.013408 2.593027 0.4594420 0.6771202 0.1677684
A2M-AS1 2.602384 2.435514 2.612423 0.3835203 0.5086496 0.3095903
A2ML1 2.280079 2.551780 2.102948 0.4340998 0.3336740 0.5642435
如果想查看样本水平的size Factor
:
nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
sf
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
2.6208348 2.5460171 2.6137423 0.3837055 0.5411728 0.3090345
当实验中加入了spike-in
,我们也可以考虑使用spike-in
进行校正。假设spike-in的read count也在输入的矩阵中,我们可以使用如下流程进行分析。
tx = read.table(file = "featureCounts/featureCounts_table.txt", row.names = NULL, header = TRUE, sep = "\t")
counts = as.matrix(tx[,8:13])
colnames(counts) = gsub("Aligned", "", sapply(strsplit(colnames(counts), "\\."), function(z) z[2]))
rownames(counts) = paste0(tx$Geneid, ' ', tx$gene_name)
head(counts)
ctrl1 ctrl2 ctrl3 stroke1 stroke2
ENSG00000223972.5 DDX11L1 63 5 15 16 17
ENSG00000227232.5 WASH7P 306 352 268 69 181
ENSG00000278267.1 MIR6859-1 0 0 0 0 0
ENSG00000243485.5 MIR1302-2HG 0 0 0 0 0
ENSG00000284332.1 MIR1302-2 0 0 0 0 0
ENSG00000237613.2 FAM138A 0 0 0 0 0
stroke3
ENSG00000223972.5 DDX11L1 42
ENSG00000227232.5 WASH7P 89
ENSG00000278267.1 MIR6859-1 0
ENSG00000243485.5 MIR1302-2HG 0
ENSG00000284332.1 MIR1302-2 0
ENSG00000237613.2 FAM138A 0
构建对象:
dds2 <- DESeqDataSetFromMatrix(countData = counts,
colData = sampleT,
design = ~ group)
dds2
class: DESeqDataSet
dim: 60728 6
metadata(1): version
assays(1): counts
rownames(60728): ENSG00000223972.5 DDX11L1 ENSG00000227232.5
WASH7P ... ENSG00000278625.1 AC237676.1 ENSG00000277374.1
AC116622.1
rowData names(0):
colnames(6): ctrl1 ctrl2 ... stroke2 stroke3
colData names(1): group
为了演示的方便,我们从上面分析结果中随机选择2000个没有调变的基因作为spike-in基因:
# 随机选择2000个基因
spikein = sample(rownames(res)[res$DEG == "NC"], 2000)
# 计算size factor
idx = sapply(strsplit(rownames(dds2), " "), function(z) z[2]) %in% spikein
dds2 = estimateSizeFactors(dds2, controlGenes = idx)
sizeFactors(dds2)
ctrl1 ctrl2 ctrl3 stroke1 stroke2 stroke3
2.6814236 2.6833622 2.7274221 0.3451876 0.5137000 0.3081085
可以看出这个结果和前面流程算出来的结果非常类似。使用这个新的size Factor
进行差异表达分析:
log2 fold change (MLE): group stroke vs ctrl
Wald test p-value: group stroke vs ctrl
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE
<numeric> <numeric> <numeric>
ENSG00000223972.5 DDX11L1 41.1031 2.79453 0.821098
ENSG00000227232.5 WASH7P 197.4425 1.29793 0.297050
ENSG00000278267.1 MIR6859-1 0.0000 NA NA
ENSG00000243485.5 MIR1302-2HG 0.0000 NA NA
ENSG00000284332.1 MIR1302-2 0.0000 NA NA
ENSG00000237613.2 FAM138A 0.0000 NA NA
stat pvalue padj
<numeric> <numeric> <numeric>
ENSG00000223972.5 DDX11L1 3.40340 6.65525e-04 2.72921e-03
ENSG00000227232.5 WASH7P 4.36938 1.24598e-05 7.86367e-05
ENSG00000278267.1 MIR6859-1 NA NA NA
ENSG00000243485.5 MIR1302-2HG NA NA NA
ENSG00000284332.1 MIR1302-2 NA NA NA
ENSG00000237613.2 FAM138A NA NA NA
The Wald test检验的是log2 fold change是否为0,当有多个因素的时候,可以使用Likelihood ratio test (LRT)。为了演示的方便,我们假设测试数据来自于2个批次:
sampleT <- data.frame(sampleT, batch = factor(rep(c("first", "second"), 3)))
sampleT
group batch
ctrl1 ctrl first
ctrl2 ctrl second
ctrl3 ctrl first
stroke1 stroke second
stroke2 stroke first
stroke3 stroke second
dds3 <- DESeqDataSetFromTximport(txi, sampleT, ~ batch + group)
dds3 <- DESeq(dds3, test="LRT", reduced= ~ batch)
res3 <- results(dds3)
head(res3)
log2 fold change (MLE): group stroke vs ctrl
LRT p-value: '~ batch + group' vs '~ batch'
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 138.9053 0.3788450 0.332901 1.30135719 0.2539655
A1BG-AS1 94.0295 -0.1443174 0.335470 0.18675073 0.6656348
A1CF 12.2568 -0.0986076 1.033360 0.00821582 0.9277777
A2M 175.9281 -1.4112335 0.673109 3.33778411 0.0677057
A2M-AS1 70.6930 -0.7652698 0.515541 2.15078246 0.1424973
A2ML1 20.0849 -0.3870942 0.747196 0.26467387 0.6069272
padj
<numeric>
A1BG 0.448675
A1BG-AS1 0.808947
A1CF 0.964256
A2M 0.172948
A2M-AS1 0.301514
A2ML1 0.768356
DEG = rep("NC", nrow(res3))
DEG[(res3$log2FoldChange > 0) & (res3$padj < 0.05)] = "UP"
DEG[(res3$log2FoldChange < 0) & (res3$padj < 0.05)] = "DN"
res3$DEG = DEG
table(res$DEG, res3$DEG)
DN NC UP
DN 3740 626 0
NC 101 51409 66
UP 0 925 3724
可以看出两种结果的重合度非常高。
我们可以使用EnhancedVolcano以及pheatmap来展示分析的结果。
write.table(res3, file = "DESeq2_res.txt", row.names = TRUE, col.names=TRUE, sep = "\t", quote = FALSE)