Identification of dissemination signature in multiple myeloma

RNA-seq workflow

The workflow is written in WDL (The Workflow Description Language, WDL) which could run on local Linux cluster as well as on the cloud. To run this RNA-seq workflow, a JSON file must be provided. Take one sample as an example, a JSON file look like:

{
"RNAseq.masterFolder":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/workflow",
"RNAseq.output_basename":"MM1S_M1_BM",
"RNAseq.read1":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/Fastq/MM1S_M1_BM_R1.fastq.gz",
"RNAseq.read2":"/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/Fastq/MM1S_M1_BM_R2.fastq.gz",
"RNAseq.genomeIndexSTAR":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/STAR_Idx",
"RNAseq.gtf_file":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/raw/gencode.v34.primary_assembly.annotation.gtf",
"RNAseq.IndexKallisto":"/sibcb2/bioinformatics/iGenome/Kallisto/GENECODE_hg38/transcriptome.idx",
"RNAseq.thread":"8",
"RNAseq.collapsed_gtf":"/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/RNA-SeQC/gencode.v34_collapsed.gtf"
}

To process this sample, run the command like this:

java -Xmx1g -jar /sibcb2/bioinformatics/software/cromwell/cromwell-50.jar \
    run wdl/RNAseq.wdl \
    --inputs JSON/MM1S_M1_BM.json

The results are organized in a single folder:

    workflow/MM1S_M1_BM
    |-- STAR
    |   |-- MM1S_M1_BM.bam
    |   |-- MM1S_M1_BM.bam.bai
    |   |-- MM1S_M1_BM_STAR_Log.txt
    |   `-- MM1S_M1_BM_STAR_QC.txt
    |-- bamCoverage
    |   `-- MM1S_M1_BM.bw
    |-- kallisto
    |   |-- MM1S_M1_BM.bam
    |   |-- MM1S_M1_BM.bam.bai
    |   |-- MM1S_M1_BM.h5
    |   |-- MM1S_M1_BM.tsv
    |   `-- run_info.json
    `-- report
        |-- MM1S_M1_BM.bam.metrics.tsv
        |-- MM1S_M1_BM_R1_fastqc.html
        |-- MM1S_M1_BM_R1_fastqc.zip
        |-- MM1S_M1_BM_R2_fastqc.html
        `-- MM1S_M1_BM_R2_fastqc.zip

RNA-seq Data summary

suppressWarnings(library(tximport))
args <- list(masterFolder = "/sibcb1/bioinformatics/dataupload/MM_rainbow_GSE121007/workflow",
        tx2g = "/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/ID/tx2g.txt",
        studyTag = "rainbowMM",
        outFolder = "DataSummary")

if(!dir.exists(args$outFolder))
  dir.create(args$outFolder)

SRX = list.files(args$masterFolder)
Ns = length(SRX)
SRX
 [1] "M361_BM"      "M361_CTC"     "M361_TM"      "M364_BM"     
 [5] "M364_CTC"     "M364_TM"      "M368_BM"      "M368_CTC"    
 [9] "M368_TM"      "M962_BM_LS"   "M962_BM_RS"   "M962_T_LS"   
[13] "M962_T_RS"    "M964_BM_LS"   "M964_BM_RS"   "M964_T_LS"   
[17] "M964_T_RS"    "MM1S_Luc_GFP" "MM1S_M1_BM"   "MM1S_M1_TM"  
[21] "MM1S_M2_BM"   "MM1S_M2_TM"   "MM1S_M3_BM"   "MM1S_M3_TM"  
[25] "OPM2"        
# load kallisto results
files = paste0(args$masterFolder, "/", SRX, "/kallisto/", SRX, ".tsv")
names(files) = SRX
tx2gene = read.table(file = args$tx2g, row.names = NULL, header = TRUE, sep = "\t")
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene, ignoreAfterBar = FALSE)

# load gene annotation
geneString = unique(paste(tx2gene$ENSG, tx2gene$Symbol))
geneAnn = data.frame(geneString)
rownames(geneAnn) = sapply(strsplit(geneString, " "), function(z) z[1])

findType = function(z) paste0(sort(unique(z)), collapse = " ")
aggT = aggregate(tx2gene$mType, by = list(tx2gene$ENSG),FUN="findType")
rownames(aggT) = as.character(aggT[,1])
geneAnn$mType = aggT[rownames(geneAnn),2]
geneAnn = geneAnn[rownames(txi$abundance), ]
names(txi)
[1] "abundance"           "counts"              "length"             
[4] "countsFromAbundance"
head(txi$abundance[,1:5])
                     M361_BM     M361_CTC      M361_TM     M364_BM
ENSG00000000003.15 0.4141947   0.01819987   0.04689216  0.06345463
ENSG00000000005.6  0.0000000   0.00000000   0.00000000  0.00000000
ENSG00000000419.12 0.3432125 451.24746500 359.87493200 83.78215960
ENSG00000000457.14 0.7275770   8.13217600  14.55484000  3.11982400
ENSG00000000460.17 0.0194113   9.34187100  20.08569700  6.26314400
ENSG00000000938.13 0.0000000   5.62875800   6.55266300  0.53106100
                      M364_CTC
ENSG00000000003.15   0.0483922
ENSG00000000005.6    0.0000000
ENSG00000000419.12 296.1021700
ENSG00000000457.14   5.0868320
ENSG00000000460.17  14.1199380
ENSG00000000938.13   3.9722840
head(geneAnn)
                                    geneString
ENSG00000000003.15   ENSG00000000003.15 TSPAN6
ENSG00000000005.6       ENSG00000000005.6 TNMD
ENSG00000000419.12     ENSG00000000419.12 DPM1
ENSG00000000457.14    ENSG00000000457.14 SCYL3
ENSG00000000460.17 ENSG00000000460.17 C1orf112
ENSG00000000938.13      ENSG00000000938.13 FGR
                                                                         mType
ENSG00000000003.15                         processed_transcript protein_coding
ENSG00000000005.6                          processed_transcript protein_coding
ENSG00000000419.12                         processed_transcript protein_coding
ENSG00000000457.14                         processed_transcript protein_coding
ENSG00000000460.17 nonsense_mediated_decay processed_transcript protein_coding
ENSG00000000938.13                         processed_transcript protein_coding
outName = paste0(args$studyTag, "_raw")
assign(outName, list(geneAnn = geneAnn, txi = txi))
# save(list = outName, file = paste0(args$outFolder, "/", outName, ".RData"))

Differential gene expression

We wanted to compare the gene expression of cells in metastatic sites (host BM) and those in primary site. The sample annotation:

sampleT = read.table(file = "samplesheet/MM1S.txt", row.names = 1, header = TRUE, sep = "\t")
sampleT
rm(list = ls())
suppressWarnings(library(DESeq2))
suppressWarnings(library(ggplot2))
args = list(rawRdFile = "DataSummary/rainbowMM_raw.RData",
      samplesheet = "samplesheet/MM1S.txt",
      studyTag = "rainbowMM",
      variable = "Group",
      treatment = "BM",
      control = "Tumor",
      absLog2FC = 0,
      FDR = 0.05,
      outFolder = "DGE")
if(!dir.exists(args$outFolder))
  dir.create(args$outFolder)

Run DGE:

load(args$rawRdFile)
assign("raw", get(paste0(args$studyTag, "_raw")))
txi = raw$txi

sampleT = read.table(file = args$samplesheet, row.names = 1, header = TRUE, sep = "\t")
kSamples = intersect(colnames(txi$counts), rownames(sampleT))
sampleTable = sampleT[kSamples, , drop = FALSE]

newTxi = list(abundance = txi$abundance[, kSamples],
        counts = txi$counts[, kSamples],
        length = txi$length[, kSamples],
        countsFromAbundance = "no")
dds <- DESeqDataSetFromTximport(newTxi, colData = sampleTable, design = ~ Group)
dds <- DESeq(dds)
res <- results(dds, contrast=c(args$variable, args$treatment, args$control))

rTable <- data.frame(res)
gene <- raw$geneAnn[rownames(rTable), "geneString"]
s1 <- rownames(sampleTable)[sampleTable[,args$variable] == args$treatment]
s2 <- rownames(sampleTable)[sampleTable[,args$variable] == args$control]
treatment_TPM <- signif(apply(txi$abundance[rownames(rTable), s1], 1, mean), 4)
control_TPM <- signif(apply(txi$abundance[rownames(rTable), s2], 1, mean), 4)
DGE <- rep("NC", nrow(rTable))
DGE[((rTable$padj) < args$FDR) & (rTable$log2FoldChange >  args$absLog2FC)] = "UP"
DGE[((rTable$padj) < args$FDR) & (rTable$log2FoldChange < -args$absLog2FC)] = "DN"
rTable = data.frame(gene, treatment_TPM, control_TPM, rTable[, c("log2FoldChange", "pvalue", "padj")], DGE)
# write.table(rTable, file = paste0(args$outFolder, "/", args$studyTag, "_MM1S_", args$treatment, "_vs_", args$control,"_DESeq2.txt"), row.names = FALSE, col.names = TRUE, sep = "\t")
head(rTable[order(rTable$pvalue),])
                                       gene treatment_TPM control_TPM
ENSG00000275302.2    ENSG00000275302.2 CCL4        1.8210      353.60
ENSG00000153234.14 ENSG00000153234.14 NR4A2        0.4639       73.55
ENSG00000170345.10   ENSG00000170345.10 FOS        2.1990      266.60
ENSG00000276070.5  ENSG00000276070.5 CCL4L2        9.6350      808.00
ENSG00000130522.6    ENSG00000130522.6 JUND        9.3740      205.90
ENSG00000120129.6   ENSG00000120129.6 DUSP1        3.6160      125.60
                   log2FoldChange        pvalue          padj DGE
ENSG00000275302.2       -7.536786 2.164787e-214 3.800500e-210  DN
ENSG00000153234.14      -7.234241 1.173668e-209 1.030246e-205  DN
ENSG00000170345.10      -6.792612 1.825140e-184 1.068072e-180  DN
ENSG00000276070.5       -6.277381 1.582486e-182 6.945529e-179  DN
ENSG00000130522.6       -4.369047 3.971796e-118 1.394577e-114  DN
ENSG00000120129.6       -4.996763 2.118665e-106 6.199213e-103  DN
volcano plot:
  options(repr.plot.width=4, repr.plot.height=3, repr.plot.res = 200)
    rTable$LogFDR = -log10(rTable$padj)
    p <- ggplot(rTable, aes(x = log2FoldChange, y = LogFDR, color = DGE)) + xlim(-10,10) + ylim(0, 50)
    p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
    p <- p + geom_point(size = 0.1) + theme(plot.title = element_text(hjust = 0.5))
    p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
    p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
    p <- p + theme(legend.position="none")
    print(p)

Dissemination signature

There are huge number of genes that are differentially expressed between metastatic samples and primary samples in mouse rainbow model. Here we have chosen top genes as signature and tested its association with clinical data.

rm(list = ls())
options(stringsAsFactors = FALSE)
fileIn <- "DGE/rainbowMM_MM1S_BM_vs_Tumor_DESeq2.txt"
Tx = read.table(file = fileIn, row.names = NULL, header = TRUE, sep = "\t")
head(Tx)
                         gene treatment_TPM control_TPM
1   ENSG00000000003.15 TSPAN6       0.06986     0.04571
2      ENSG00000000005.6 TNMD       0.00000     0.00000
3     ENSG00000000419.12 DPM1      91.55000    97.88000
4    ENSG00000000457.14 SCYL3       9.10100    10.44000
5 ENSG00000000460.17 C1orf112      12.27000    15.41000
6      ENSG00000000938.13 FGR       0.75280     0.19560
  log2FoldChange       pvalue        padj DGE
1    0.722823540 0.4009026582 0.644096193  NC
2             NA           NA          NA  NC
3   -0.002166037 0.9872822644 0.994761676  NC
4   -0.122758867 0.4831839672 0.713463554  NC
5   -0.236411489 0.1049239413 0.289557836  NC
6    2.076257775 0.0002120523 0.002733326  UP
Idx1 = (Tx$treatment_TPM > 1) | (Tx$control_TPM > 1)
Idx2 = !is.na(Tx$padj)
Tx = Tx[Idx1 & Idx2, ]
score <- -log10(Tx$padj)*sign(Tx$log2FoldChange)
names(score) <- Tx$gene

UP <- unique(sapply(strsplit(names(sort(score, decreasing = TRUE)[1:300]), " "), function(z) z[2]))
DN <- unique(sapply(strsplit(names(sort(score, decreasing = FALSE)[1:300]), " "), function(z) z[2]))
PG <- unique(sapply(strsplit(Tx$gene, " "), function(z) z[2]))

sigList <- list(MM1S = list(UP = UP, DN = DN, PG = PG))
# save(sigList, file = "RData/rainbowMM_DGE_top.RData")