The workflow is written in WDL (The Workflow Description Language, WDL) which could run on local Linux cluster as well as on the cloud. To run this RNA-seq workflow, a JSON file must be provided. Take one sample as an example, a JSON file look like:
{
"RNAseq.masterFolder":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/workflow",
"RNAseq.output_basename":"MM1S_M1_BM",
"RNAseq.read1":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/Fastq/MM1S_M1_BM_R1.fastq.gz",
"RNAseq.read2":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/Fastq/MM1S_M1_BM_R2.fastq.gz",
"RNAseq.genomeIndexSTAR":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/STAR_Idx",
"RNAseq.gtf_file":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/raw/gencode.v34.primary_assembly.annotation.gtf",
"RNAseq.IndexKallisto":"/sibcb2/bioinformatics/iGenome/Kallisto/GENECODE_hg38/transcriptome.idx",
"RNAseq.thread":"8",
"RNAseq.collapsed_gtf":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/RNA-SeQC/gencode.v34_collapsed.gtf"
}
To process this sample, run the command like this:
java -Xmx1g -jar /sibcb2/bioinformatics/software/cromwell/cromwell-50.jar \
\
run wdl/RNAseq.wdl --inputs JSON/MM1S_M1_BM.json
The results are organized in a single folder:
/MM1S_M1_BM
workflow|-- STAR
| |-- MM1S_M1_BM.bam
| |-- MM1S_M1_BM.bam.bai
| |-- MM1S_M1_BM_STAR_Log.txt
| `-- MM1S_M1_BM_STAR_QC.txt
|-- bamCoverage
| `-- MM1S_M1_BM.bw
|-- kallisto
| |-- MM1S_M1_BM.bam
| |-- MM1S_M1_BM.bam.bai
| |-- MM1S_M1_BM.h5
| |-- MM1S_M1_BM.tsv
| `-- run_info.json
-- report
`|-- MM1S_M1_BM.bam.metrics.tsv
|-- MM1S_M1_BM_R1_fastqc.html
|-- MM1S_M1_BM_R1_fastqc.zip
|-- MM1S_M1_BM_R2_fastqc.html
-- MM1S_M1_BM_R2_fastqc.zip `
suppressWarnings(library(tximport))
args <- list(masterFolder = "/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/workflow",
tx2g = "/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/ID/tx2g.txt",
studyTag = "rainbowMM",
outFolder = "DataSummary")
if(!dir.exists(args$outFolder))
dir.create(args$outFolder)
SRX = list.files(args$masterFolder)
Ns = length(SRX)
SRX
[1] "M361_BM" "M361_CTC" "M361_TM" "M364_BM"
[5] "M364_CTC" "M364_TM" "M368_BM" "M368_CTC"
[9] "M368_TM" "M962_BM_LS" "M962_BM_RS" "M962_T_LS"
[13] "M962_T_RS" "M964_BM_LS" "M964_BM_RS" "M964_T_LS"
[17] "M964_T_RS" "MM1S_Luc_GFP" "MM1S_M1_BM" "MM1S_M1_TM"
[21] "MM1S_M2_BM" "MM1S_M2_TM" "MM1S_M3_BM" "MM1S_M3_TM"
[25] "OPM2"
# load kallisto results
files = paste0(args$masterFolder, "/", SRX, "/kallisto/", SRX, ".tsv")
names(files) = SRX
tx2gene = read.table(file = args$tx2g, row.names = NULL, header = TRUE, sep = "\t")
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene, ignoreAfterBar = FALSE)
# load gene annotation
geneString = unique(paste(tx2gene$ENSG, tx2gene$Symbol))
geneAnn = data.frame(geneString)
rownames(geneAnn) = sapply(strsplit(geneString, " "), function(z) z[1])
findType = function(z) paste0(sort(unique(z)), collapse = " ")
aggT = aggregate(tx2gene$mType, by = list(tx2gene$ENSG),FUN="findType")
rownames(aggT) = as.character(aggT[,1])
geneAnn$mType = aggT[rownames(geneAnn),2]
geneAnn = geneAnn[rownames(txi$abundance), ]
names(txi)
[1] "abundance" "counts" "length"
[4] "countsFromAbundance"
head(txi$abundance[,1:5])
M361_BM M361_CTC M361_TM M364_BM
ENSG00000000003.15 0.4141947 0.01819987 0.04689216 0.06345463
ENSG00000000005.6 0.0000000 0.00000000 0.00000000 0.00000000
ENSG00000000419.12 0.3432125 451.24746500 359.87493200 83.78215960
ENSG00000000457.14 0.7275770 8.13217600 14.55484000 3.11982400
ENSG00000000460.17 0.0194113 9.34187100 20.08569700 6.26314400
ENSG00000000938.13 0.0000000 5.62875800 6.55266300 0.53106100
M364_CTC
ENSG00000000003.15 0.0483922
ENSG00000000005.6 0.0000000
ENSG00000000419.12 296.1021700
ENSG00000000457.14 5.0868320
ENSG00000000460.17 14.1199380
ENSG00000000938.13 3.9722840
head(geneAnn)
geneString
ENSG00000000003.15 ENSG00000000003.15 TSPAN6
ENSG00000000005.6 ENSG00000000005.6 TNMD
ENSG00000000419.12 ENSG00000000419.12 DPM1
ENSG00000000457.14 ENSG00000000457.14 SCYL3
ENSG00000000460.17 ENSG00000000460.17 C1orf112
ENSG00000000938.13 ENSG00000000938.13 FGR
mType
ENSG00000000003.15 processed_transcript protein_coding
ENSG00000000005.6 processed_transcript protein_coding
ENSG00000000419.12 processed_transcript protein_coding
ENSG00000000457.14 processed_transcript protein_coding
ENSG00000000460.17 nonsense_mediated_decay processed_transcript protein_coding
ENSG00000000938.13 processed_transcript protein_coding
We wanted to compare the gene expression of cells in metastatic sites (host BM) and those in primary site. The sample annotation:
sampleT = read.table(file = "samplesheet/MM1S.txt", row.names = 1, header = TRUE, sep = "\t")
sampleT
rm(list = ls())
suppressWarnings(library(DESeq2))
suppressWarnings(library(ggplot2))
args = list(rawRdFile = "DataSummary/rainbowMM_raw.RData",
samplesheet = "samplesheet/MM1S.txt",
studyTag = "rainbowMM",
variable = "Group",
treatment = "BM",
control = "Tumor",
absLog2FC = 0,
FDR = 0.05,
outFolder = "DGE")
if(!dir.exists(args$outFolder))
dir.create(args$outFolder)
Run DGE:
load(args$rawRdFile)
assign("raw", get(paste0(args$studyTag, "_raw")))
txi = raw$txi
sampleT = read.table(file = args$samplesheet, row.names = 1, header = TRUE, sep = "\t")
kSamples = intersect(colnames(txi$counts), rownames(sampleT))
sampleTable = sampleT[kSamples, , drop = FALSE]
newTxi = list(abundance = txi$abundance[, kSamples],
counts = txi$counts[, kSamples],
length = txi$length[, kSamples],
countsFromAbundance = "no")
dds <- DESeqDataSetFromTximport(newTxi, colData = sampleTable, design = ~ Group)
dds <- DESeq(dds)
res <- results(dds, contrast=c(args$variable, args$treatment, args$control))
rTable <- data.frame(res)
gene <- raw$geneAnn[rownames(rTable), "geneString"]
s1 <- rownames(sampleTable)[sampleTable[,args$variable] == args$treatment]
s2 <- rownames(sampleTable)[sampleTable[,args$variable] == args$control]
treatment_TPM <- signif(apply(txi$abundance[rownames(rTable), s1], 1, mean), 4)
control_TPM <- signif(apply(txi$abundance[rownames(rTable), s2], 1, mean), 4)
DGE <- rep("NC", nrow(rTable))
DGE[((rTable$padj) < args$FDR) & (rTable$log2FoldChange > args$absLog2FC)] = "UP"
DGE[((rTable$padj) < args$FDR) & (rTable$log2FoldChange < -args$absLog2FC)] = "DN"
rTable = data.frame(gene, treatment_TPM, control_TPM, rTable[, c("log2FoldChange", "pvalue", "padj")], DGE)
# write.table(rTable, file = paste0(args$outFolder, "/", args$studyTag, "_MM1S_", args$treatment, "_vs_", args$control,"_DESeq2.txt"), row.names = FALSE, col.names = TRUE, sep = "\t")
head(rTable[order(rTable$pvalue),])
gene treatment_TPM control_TPM
ENSG00000275302.2 ENSG00000275302.2 CCL4 1.8210 353.60
ENSG00000153234.14 ENSG00000153234.14 NR4A2 0.4639 73.55
ENSG00000170345.10 ENSG00000170345.10 FOS 2.1990 266.60
ENSG00000276070.5 ENSG00000276070.5 CCL4L2 9.6350 808.00
ENSG00000130522.6 ENSG00000130522.6 JUND 9.3740 205.90
ENSG00000120129.6 ENSG00000120129.6 DUSP1 3.6160 125.60
log2FoldChange pvalue padj DGE
ENSG00000275302.2 -7.536786 2.164787e-214 3.800500e-210 DN
ENSG00000153234.14 -7.234241 1.173668e-209 1.030246e-205 DN
ENSG00000170345.10 -6.792612 1.825140e-184 1.068072e-180 DN
ENSG00000276070.5 -6.277381 1.582486e-182 6.945529e-179 DN
ENSG00000130522.6 -4.369047 3.971796e-118 1.394577e-114 DN
ENSG00000120129.6 -4.996763 2.118665e-106 6.199213e-103 DN
options(repr.plot.width=4, repr.plot.height=3, repr.plot.res = 200)
rTable$LogFDR = -log10(rTable$padj)
p <- ggplot(rTable, aes(x = log2FoldChange, y = LogFDR, color = DGE)) + xlim(-10,10) + ylim(0, 50)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1) + theme(plot.title = element_text(hjust = 0.5))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p <- p + theme(legend.position="none")
print(p)
There are huge number of genes that are differentially expressed between metastatic samples and primary samples in mouse rainbow model. Here we have chosen top genes as signature and tested its association with clinical data.
rm(list = ls())
options(stringsAsFactors = FALSE)
fileIn <- "DGE/rainbowMM_MM1S_BM_vs_Tumor_DESeq2.txt"
Tx = read.table(file = fileIn, row.names = NULL, header = TRUE, sep = "\t")
head(Tx)
gene treatment_TPM control_TPM
1 ENSG00000000003.15 TSPAN6 0.06986 0.04571
2 ENSG00000000005.6 TNMD 0.00000 0.00000
3 ENSG00000000419.12 DPM1 91.55000 97.88000
4 ENSG00000000457.14 SCYL3 9.10100 10.44000
5 ENSG00000000460.17 C1orf112 12.27000 15.41000
6 ENSG00000000938.13 FGR 0.75280 0.19560
log2FoldChange pvalue padj DGE
1 0.722823540 0.4009026582 0.644096193 NC
2 NA NA NA NC
3 -0.002166037 0.9872822644 0.994761676 NC
4 -0.122758867 0.4831839672 0.713463554 NC
5 -0.236411489 0.1049239413 0.289557836 NC
6 2.076257775 0.0002120523 0.002733326 UP
Idx1 = (Tx$treatment_TPM > 1) | (Tx$control_TPM > 1)
Idx2 = !is.na(Tx$padj)
Tx = Tx[Idx1 & Idx2, ]
score <- -log10(Tx$padj)*sign(Tx$log2FoldChange)
names(score) <- Tx$gene
UP <- unique(sapply(strsplit(names(sort(score, decreasing = TRUE)[1:300]), " "), function(z) z[2]))
DN <- unique(sapply(strsplit(names(sort(score, decreasing = FALSE)[1:300]), " "), function(z) z[2]))
PG <- unique(sapply(strsplit(Tx$gene, " "), function(z) z[2]))
sigList <- list(MM1S = list(UP = UP, DN = DN, PG = PG))
# save(sigList, file = "RData/rainbowMM_DGE_top.RData")