The mHap map of common cancers: Characterizing DNA methylation inter-tumor heterogeneity in cancers

Figure6. Characterization of DNA methylation inter-tumoral heterogeneity in cancers. (A) Boxplots showing Pearson correlation coefficients of average methylation between pairwise CpGs in TCGA 450K array data. Regions analyzed: pan-cancer MHBs (all cancer types), cancer-type-specific MHBs, CGIs, and random genomic regions.(B) Quantification of concordant methylation levels. Using COAD as an example, LOESS curves were fitted between mean methylation and variance for MHBs versus random regions. The AUC quantifies concordance (lower AUC = higher concordance).(C) Number of DEGs between tumors with lowest/ highest AUCs. Red: upregulated; blue: downregulated. (D) Enriched pathways in upregulated DEGs, hallmark pathways from MSigDB. (E) Volcano plot associating driver mutations with concordant methylation levels (measured by AUC). Statistical significance was calculated by student’s two-sided t-test (FDR < 0.05). Red: AUC increased; blue: AUC decreased. (F) Boxplots showing IDH1 mutation significantly associated with methylation concordance in TCGA dataset.(G) Independent validation of IDH1 mutation effect (GSE50774). Mean methylation-variance plots compare IDH1-mutant (n = 13) and wild-type (n = 13) samples.

loading required packages:

library(GenomicRanges)
library(regioneR)
library(tidyverse)
library(stringr)
library(data.table)
library(regioneR)
library(ggplot2)
library(rGREAT)
library(clusterProfiler)
library(msigdbr)
library(ggthemes)
library(RColorBrewer)
library(limma)

Figure 6A

# 1. Load data
# Calculate the methylation correlation of HM450K probes from Random, CGI, and MHB regions
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]

sourceFolder = "/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/TCGA_DNAm"
# CpG correlation
ArrayIMCorrelation <-function(MM,IntervalGR,cpgGR,Tag){
  # check seqlevels
  if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
    stop("No shared seqlevels found.\n")
  # only keep CpGs that locate in intervals
  Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
  Idx2 = names(cpgGR) %in% rownames(MM)
  Idx  = Idx1 & Idx2
  if(sum(Idx) == 0)
    stop("No CpGs found in pre-defined regions.\n")
  cpgGR = cpgGR[Idx, ]
  # keep shared CpGs
  cNames = intersect(rownames(MM), names(cpgGR))
  cpgGR  = cpgGR[cNames]
  MM    = MM[cNames, ]
  # Only select the tumor samples
  MM = MM[,!grepl("11",substr(colnames(MM),14,15))]
  # Sample-by-sample processing to control memory
  mergedM = matrix(NA, length(IntervalGR), 1)
  x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
  cM = list()
  for (i in unique(subjectHits(x)) ){
    a  = which(subjectHits(x) == i)
    tag = paste0("S",i)
    if (length(a)<2){
      cM[[tag]] = NA
    }else if (nrow(na.omit(MM[a,]))<2) {
      cM[[tag]] = NA
    } else {
      b  = as.vector(cor(t(na.omit(MM[a,]))))
      cM[[tag]]  = median(b[b!=1])
    }
  }
  aT = do.call(rbind,cM)
  aID = as.character(gsub("S","",rownames(aT)))
  aMM = as.matrix(aT[,1])
  mergedM[as.integer(aID), ] = aMM
  colnames(mergedM) = Tag
  ## return
  return(mergedM)
}

###############################################
# 1. random region : create by bedtools "shuffle" keep cpg density and region length compare with cancer MHB
###############################################
Random_regions = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/regions/MHB_Callable_random.bed")

vFiles = list.files(sourceFolder)
Ns = length(vFiles)
Random_List = list()

for(i in 1:Ns){
  Tag = gsub(".RData", "", vFiles[i])
  # cat("Processing", Tag, "\n")
  fileIn = paste0(sourceFolder, "/", vFiles[i])
  load(fileIn)
  assign("MM", get(Tag))
  rm(list = Tag)
  Random_List[[i]] = ArrayIMCorrelation(MM, Random_regions, RnBeads_450K_hg19_GR,Tag)
}
names(Random_List) = gsub(".RData", "", vFiles)
Random_List = do.call(cbind,Random_List)
mcols(Random_regions) = Random_List
save(Random_List,Random_regions,file="TCGA_Random_ArrayCorrelation.RData")

##################################################
# 2. CGI regions 
#################################################
CGI = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/regions/hg19_cpgIsland.bed")

vFiles = list.files(sourceFolder)
Ns = length(vFiles)
CGI_List = list()

for(i in 1:Ns){
  Tag = gsub(".RData", "", vFiles[i])
  # cat("Processing", Tag, "\n")
  fileIn = paste0(sourceFolder, "/", vFiles[i])
  load(fileIn)
  assign("MM", get(Tag))
  rm(list = Tag)
  CGI_List[[i]] = ArrayIMCorrelation(MM, CGI, RnBeads_450K_hg19_GR,Tag)
}
names(CGI_List) = gsub(".RData", "", vFiles)
CGI_List = do.call(cbind,CGI_List)
mcols(CGI) = CGI_List
save(CGI_List,CGI,file="TCGA_CGI_ArrayCorrelation.RData")

####################################################
## 3. Cancer-specific MHBs regions
###################################################
path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor"
files = list.files(path)

cTList=list()
for (cT in files ){
  # Read Cancer MHB
  Cancer_MHB = toGRanges(paste0(path,"/",cT))
  Ns = length(vFiles)
  cList = list()
  for(i in 1:Ns){
      Tag = gsub(".RData", "", vFiles[i])
      # cat("Processing", Tag, "\n")
      fileIn = paste0(sourceFolder, "/", vFiles[i])
      load(fileIn)
      assign("MM", get(Tag))
      rm(list = Tag)
      cList[[i]] = ArrayIMCorrelation(MM, Cancer_MHB, RnBeads_450K_hg19_GR,Tag)
    }
    names(cList) = gsub(".RData", "", vFiles)
    mcols(Cancer_MHB)= do.call(cbind,cList)
    cTList[[cT]] = Cancer_MHB
  }
names(cTList) = files %>% str_split_i("[_.]",i=2)
## save
save(cTList,file="TCGA_Cancer_specific_MHB_ArrayCorrelation.RData")

##################################################
# 4. 81567 MHB region 
#################################################
tMHB = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz")

vFiles = list.files(sourceFolder)
Ns = length(vFiles)
tMHB_List = list()

for(i in 1:Ns){
  Tag = gsub(".RData", "", vFiles[i])
  # cat("Processing", Tag, "\n")
  fileIn = paste0(sourceFolder, "/", vFiles[i])
  load(fileIn)
  assign("MM", get(Tag))
  rm(list = Tag)
  tMHB_List[[i]] = ArrayIMCorrelation(MM, tMHB, RnBeads_450K_hg19_GR,Tag)
}
names(tMHB_List) = gsub(".RData", "", vFiles)
tMHB_List = do.call(cbind,tMHB_List)
mcols(tMHB) = tMHB_List
save(tMHB_List,tMHB,file="TCGA_tMHB_ArrayCorrelation.RData")

###############################################
# 5. plot 
################################################

vFiles=names(cTList)
data = list()
for (i in vFiles){
  if ( i=="BRCA" ) {
    tag = i
  }else if(i=="CESC"){
    tag = i
  }else if(i=="COAD") {
    tag = c("COAD","COADREAD","READ")
  }else if(i=="ESCA"){
    tag = c("ESCA","STES")
  }else if(i=="HNSC"){
    tag = "HNSC"
  }else if( i=="LIHC"){
    tag = "LIHC"
  }else if(i=="NSCLC"){
    tag = c("LUAD","LUSC")
  }else if ( i=="OV"){
    tag = "OV"
  }else if (i=="PACA"){
    tag = "PAAD"
  }else if(i=="STAD"){
    tag = c("STAD","STES")
  }else if( i=="THCA") {
    tag = "THCA"
  }

  for (j in tag){
        ## Cancer-specific MHB
        x = na.omit(mcols(cTList[[i]])[j])
    x$Group = "MHB"
        x$Type = j
        colnames(x) <- c("res","Group","Type")
        ## union cancer-type MHB
        xx = na.omit(mcols(tMHB)[j])
        xx$Group = "tMHB"
        xx$Type = j
        colnames(xx) <- c("res","Group","Type")
        ## CGI
    y= na.omit(mcols(CGI)[j])
    y$Group = "CGI"
        y$Type = j
        colnames(y) <- c("res","Group","Type")
        ## ramdom
        z = na.omit(mcols(Random_regions)[j])
        z$Group = "Random"
        z$Type = j
        colnames(z) <- c("res","Group","Type")
    mat = as.data.frame(rbind(x,xx,y,z))
    data[[j]] = mat
    }
}
res_data <- do.call(rbind,data)
res_data$Group = factor(res_data$Group,levels = c("MHB","tMHB","CGI","Random"))
## boxplot
p<-ggplot(res_data, aes(x=Type, y=res, fill= Group )) + 
      geom_boxplot(outlier.shape = NA,position = position_dodge(width = 0.8)) +
      labs(x= "",y="Pearson's Correlation Coefficience (r)") +
      scale_fill_brewer(palette = "Set1")+
      theme_bw() +
      theme(panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid = element_blank(),
        axis.line=element_line(color="black"),
         axis.text.x = element_text(color="black",size=10,angle = 60,hjust = 1),
        axis.text.y = element_text(color="black",size=10),
        axis.ticks = element_line(color="black"),
        axis.ticks.length = unit(5, "pt"),
             legend.position="bottom") 
print(p)

Figure 6B

## random
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_Methylation_random_0421.RData")
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_MethylVariations_random_0421.RData")
rList_random <- MM
metVar_random <- metVar 

## COAD MHB
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_Methylation_0421.RData")
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_MethylVariations_0421.RData")
rList <- MM
metVar <- metVar

MHB = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Call_able.cpg.bed") %>%
     as.character()
MHB_random = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Callable_random.bed") %>%
 as.character()

## loess fitted
loessCurve <- function(dataFrame, xValue) {
  loessDf <- loess(var ~ mean, data = dataFrame, span =0.7,se = FALSE)  ###Random :span = 1
  yloessDfVals <-  c(0,loessDf$fitted,0)  ### var 
  #yloessDfVals[yloessDfVals < 0] <-  0
  xloessDfVals  <- c(0,loessDf$x,1)  ### mean
  loessDfAUC <- round(pracma::trapz(xloessDfVals, yloessDfVals), 3)
  yValue <- mean(yloessDfVals[which(round(xloessDfVals, 2) == xValue)])
  list(xloessDfVals = xloessDfVals, yloessDfVals = yloessDfVals, loessDfAUC = loessDfAUC, yValue = yValue)
}

######################################################
## test demo  MCB vs. Random
######################################################
tumor = "COAD"
## mcb
test_methy <- rList[[tumor]] %>% as.data.frame() %>% mutate(name=MHB) %>%
                pivot_longer(-c(name),names_to = "sample",values_to = "mean") 
test_var <- metVar[[tumor]] %>% as.data.frame() %>% mutate(name=MHB) %>%
                pivot_longer(-c(name),names_to = "sample",values_to = "var")

test_mcb <- test_methy %>% inner_join(test_var,by=c("name","sample")) %>% na.omit() 
set1 <- data.frame(MM=test_mcb$mean,var=test_mcb$var,type="mcb") %>% mutate(group = cut_interval(MM,length = 0.005)) %>% 
          group_by(group,type) %>% summarise(mean = mean(MM), var = mean(var))  %>% 
          ungroup() 

mcb_demo <- loessCurve(set1, 0.5) 
set11 <- data.frame(MM=mcb_demo$xloessDfVals,var=mcb_demo$yloessDfVals,type="mcb")
 
## random
test_methy_random <- rList_random[[tumor]] %>% as.data.frame() %>% mutate(name=MHB_random) %>%
                pivot_longer(-c(name),names_to = "sample",values_to = "mean")

test_var_random <- metVar_random[[tumor]]%>% as.data.frame() %>% mutate(name=MHB_random) %>%
                pivot_longer(-c(name),names_to = "sample",values_to = "var")
test_random <- test_methy_random %>% inner_join(test_var_random,by=c("name","sample")) %>% na.omit() 

set2 <-  data.frame(MM=test_random$mean,var=test_random$var,type="random")  %>% mutate(group = cut_interval(MM,length = 0.005)) %>% 
          group_by(group,type) %>% summarise(mean = mean(MM), var = mean(var))  %>% 
          ungroup() 
random_demo <-loessCurve(set2, 0.5)
set22 <-  data.frame(MM=random_demo$xloessDfVals,var=random_demo$yloessDfVals,type="random")

## merged and plot
dataset <- rbind(set11,set22) %>% rename_with(~c("MM","var","type")) 
p<-ggplot(dataset, aes(x=MM, y=var, color=type)) +
    geom_line(aes(color=type)) +
    scale_color_manual(values = c("mcb" = "firebrick", "random" = "steelblue")) +
    geom_area(aes(fill=type),alpha = 0.3,position="identity") +
    scale_fill_manual(values = c("mcb" = "firebrick", "random" = "steelblue")) +
    ggtitle("TCGA-COAD", subtitle = "Methylation & Variation") + 
    labs(x = "Methylation", y = "Variation") +
    scale_y_continuous(expand=c(0,0),limits = c(0,0.08)) +
    annotate("text",x=0.45,y=0.045,label=paste0("MCB AUC = ",round(mcb_demo$loessDfAUC,3)),size=3,color="firebrick") + 
    annotate("text",x=0.45,y=0.04,label=paste0("Random AUC = ",round(random_demo$loessDfAUC,3)),size=3,color="steelblue") +
    geom_segment(x = 0.5, xend = 0.5, y = 0, yend = random_demo$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +
    geom_segment(x = 0, xend = 0.5, y = random_demo$yValue, yend = random_demo$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +
    geom_segment(x = 0, xend = 0.5, y = mcb_demo$yValue, yend = mcb_demo$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +  
    geom_text(label = round(mcb_demo$yValue,3), x = 0, y = mcb_demo$yValue,
            vjust = 1.5, color = "firebrick", size = 3) +                  
    geom_text(label = round(random_demo$yValue,3), x = 0, y = random_demo$yValue,
            vjust = 1.5, color = "steelblue", size = 3) +
    theme_bw() +
    theme(panel.background = element_blank(),
         panel.border = element_rect(color = "black", linewidth = 0.8,linetype = "solid"),
          panel.grid = element_blank(),
          plot.title = element_text(hjust = 0.5),
          plot.subtitle = element_text(hjust = 0.5),
          axis.ticks = element_line(linewidth  = 0.5,color="black"),
          axis.line = element_blank())
print(p)

Figure 6C

## 1. MHB AUC score (Sample-level)
tag <- names(MM) 
AUC_MHB_score <- pbapply::pblapply(tag, function(x) {
            cat(x, "\n")
            # MHB methylation data
                methyl  <- MM[[x]] %>% as.data.frame() 
            # MHB variation data
                var  <- metVar[[x]]  %>% as.data.frame() 
            # ROC analysis
                s1 <- names(methyl)
                AUC_Scores <-  lapply(s1,function(z){ 
                        temp <- data.frame("mean" = methyl[[z]],"var"=var[[z]]) %>% 
                                na.omit() %>% arrange(mean)
                        fit <- loessCurve(temp, 0.5) 
                        return(fit$loessDfAUC)
                    })
            # rename 
            names(AUC_Scores) <- s1
            score <- do.call(rbind, AUC_Scores) %>% as.data.frame() %>% 
                            dplyr::rename(AUC = V1) %>% mutate(Type = x) %>%
                            rownames_to_column(var = "ID") %>% 
                            filter(!str_detect( ID %>% str_split_i("-",i=4) ,"11")) %>%
                            mutate(cluster = ifelse(AUC > median(AUC), "High", "Low"))
            return(score)               
    })

ACC 
BLCA 
BRCA 
CESC 
CHOL 
COAD 
COADREAD 
DLBC 
ESCA 
GBM 
GBMLGG 
HNSC 
KICH 
KIPAN 
KIRC 
KIRP 
LAML 
LGG 
LIHC 
LUAD 
LUSC 
MESO 
OV 
PAAD 
PCPG 
PRAD 
READ 
SARC 
SKCM 
STAD 
STES 
TGCT 
THCA 
THYM 
UCEC 
UCS 
UVM

names(AUC_MHB_score) <- tag 
## AUC scores
xx = do.call(rbind,AUC_MHB_score) %>% as.data.frame() %>% arrange(Type, desc(AUC))
## remove normal samples
xx <- xx %>% mutate(cancer = ID %>% str_split_i("-",i=4),
                    TT = ifelse(str_detect(cancer,"11"),"Normal","Tumor")) %>% 
                                filter(TT != "Normal")
## TPM expression
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/Firehose_Expression_Tumor_Primary.RData")
## AUC MHB variation
AUC_MHB_score <- xx %>% mutate(sample = str_sub(ID,1,12))
cancer_type <- names(Firehose_Expression_Tumor)

## 2. AUC low vs high
TCGA_AUC_DEG <- pbapply::pblapply(cancer_type,function(i){
        ## groupping
      groups= AUC_MHB_score %>% filter(Type == i, str_detect(ID %>% str_sub(13,15), "-01|-03"))
        ## common samples (DNAm & TPM)
        shared_samples <- intersect(colnames(Firehose_Expression_Tumor[[i]]),groups$sample) %>% unique()
      groups <- groups %>% filter(sample %in% shared_samples) %>% 
                        mutate(cluster = ifelse(AUC > median(AUC), "High", "Low"),
                                cluster=fct_inorder(cluster))
        ## data matrix log2(TPM+1) 
        expset <- Firehose_Expression_Tumor[[i]]
        expset <- expset[which(rowMeans(expset)>=1),]
        expset <- expset[,groups$sample]

        ## pheno data
        Group <- groups$cluster
        design<-model.matrix(~Group)

        ## DEG limma fit
        fit<-lmFit(expset,design)
        fit1<-eBayes(fit)
        options(digits = 4)
        all_diff<-topTable(fit1,coef=2,adjust='BH',number = 30000,sort.by = 'logFC')
        all_diff$DEG <- ifelse(all_diff$logFC >=1 & all_diff$adj.P.Val<= 0.05, "Up",
                        ifelse(all_diff$logFC <= -1 & all_diff$adj.P.Val <=0.05, "Down","NC"))
        tT = subset(all_diff,select=c("P.Value","adj.P.Val","logFC","DEG"))
        colnames(tT)=c("P_Value","FDR","logFC","DEG")
        tT <- tT[rownames(expset),]
        matrix <- expset %>% as.data.frame() %>% cbind(tT)
        return(matrix)
})
names(TCGA_AUC_DEG) <- cancer_type

## The number of DEGs
counts<- lapply(cancer_type,function(x){ TCGA_AUC_DEG[[x]] %>% 
                filter(DEG !="NC") %>% count(DEG) %>% mutate(type=x)})
counts <- do.call(rbind,counts) 

## PLOT
p<-ggplot(counts,aes(x=fct_rev(type),y=n,fill=fct_rev(DEG))) +
   geom_bar(stat = "identity",position = position_dodge(width = 0.8),linewidth=0.8) +
   labs(x="",y="Number of genes") +
   scale_fill_brewer(palette = "Set1") +
   theme_bw() +
   theme(panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.text.x =element_text(angle = 90,hjust = 1)) +
    coord_flip()
print(p)

Figure 6D

## 1. AUC score low vs high (DEGs)
gene_list<-lapply(TCGA_AUC_DEG,function(x){
           genes <-  x %>% dplyr::filter(DEG != "NC") %>% 
                            dplyr::select("FDR","logFC","DEG") %>% 
                            rownames_to_column(var="genes")
})
names(gene_list) <- cancer_type
pan_DEGs <- do.call(rbind,gene_list) %>% mutate(Type = rownames(.) %>% str_split_i("\\.",1))

## 2. Hallmark enrichment
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>% 
             dplyr::select(gs_name, entrez_gene)
## convert : the existence of genes
pan_genes <-  pan_DEGs %>% group_by(Type) %>% group_split()
hallmarks_enrichment <- lapply(pan_genes,function(x){
    up_genes <- x %>% filter(DEG=="Up") %>% pull(genes) %>% unique()
    down_genes <- x %>% filter(DEG=="Down") %>% pull(genes) %>% unique()
    ## up
    up_genes_entrez = bitr(up_genes,               
                        fromType="SYMBOL",             
                        toType="ENTREZID",              
                        OrgDb="org.Hs.eg.db")               
    ## down
    down_genes_entrez = bitr(down_genes,               
                        fromType="SYMBOL",             
                        toType="ENTREZID",              
                        OrgDb="org.Hs.eg.db")
em_up <- enricher(up_genes_entrez$ENTREZID, TERM2GENE=m_t2g,pAdjustMethod= "BH") %>%
         as.data.frame() %>% mutate(DEG = "Up")
em_down <- enricher(down_genes_entrez$ENTREZID, TERM2GENE=m_t2g,pAdjustMethod= "BH") %>%
         as.data.frame() %>% mutate(DEG = "Down")
em <- rbind(em_up,em_down) %>% filter(p.adjust < 0.05)
return(em)
})
names(hallmarks_enrichment) <- pan_DEGs %>% pull(Type) %>% unique()

## dotplot
df <- do.call(rbind,hallmarks_enrichment) %>% filter(DEG == "Up") %>% 
        mutate(Type =rownames(.) %>% str_split_i("\\.",1))  %>% 
        dplyr::select(ID,GeneRatio,p.adjust,Count,Type) %>% 
        mutate(GeneRatio = (GeneRatio %>% str_split_i("/",1) %>% as.numeric())/(GeneRatio %>% str_split_i("/",2) %>% as.numeric()),
        Count = as.numeric(Count)) %>% arrange(p.adjust)

colors <- colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
p<-ggplot(df, aes(x = Type, y = reorder(ID, GeneRatio))) +
  geom_point(aes(size = GeneRatio, color = -log10(p.adjust))) +
    #scale_color_gradient(low = "red", high = "blue") +
    #scale_color_continuous(type = "") +
    theme_bw()  +
    scale_color_gradientn(colors = colors) +
    labs(x="",y="") +
    theme(axis.text.x = element_text(angle = 90),
        axis.text.y = element_text(size=6)) 
print(p)

Figure 6E

## 1. t.test (Mutation vs. WT )
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_Gene_level_MutsigCV.RData")
tag <- names(Gene_level_MutsigCV)

## 2. TCGA  MHB Mut vs WT (Sample-level)
files <- names(MM) 
AUC_sig_MCB <- pbapply::pblapply(file, function(x) {
            ## load Sigificant Mutation genes
            gene_mutation <- Gene_level_MutsigCV[[x]] %>% names()
            
            if (length(gene_mutation)!=0) {
                gene_AUC_Sig <- pbapply::pblapply(gene_mutation, function(y) {
                ## 1. Sample classification
                    ## load sample_Mut
                    sample_Mut_ID <- Gene_level_MutsigCV[[x]] %>%
                                    dplyr::select(all_of(y)) %>% 
                                    dplyr::rename_with(~c("gene")) %>% 
                                    dplyr::filter( gene == 1)  %>% rownames() %>%
                                    str_sub(1,16) 
                    ## load sample_WT
                    sample_WT_ID <-Gene_level_MutsigCV[[x]] %>% 
                                    dplyr::select(all_of(y)) %>% 
                                    dplyr::rename_with(~c("gene")) %>% 
                                    dplyr::filter( gene == 0) %>% rownames() %>% str_sub(1,16)
                #########################################
                ## 2. MHB Methylation data
                # MHB methylation data
                    methyl_WT  <- MM[[x]] %>% as.data.frame() %>% 
                                    dplyr::select(contains(sample_WT_ID))
                    methyl_Mut <- MM[[x]] %>% as.data.frame() %>% 
                                    dplyr::select(contains(sample_Mut_ID))
                # MHB variation data
                    var_WT  <- metVar[[x]]  %>% as.data.frame() %>% 
                                    dplyr::select(contains(sample_WT_ID))
                    var_Mut <- metVar[[x]]  %>% as.data.frame() %>% 
                                    dplyr::select(contains(sample_Mut_ID))
                ## 3. ROC analysis
                # WT
                    s1 <- names(methyl_WT)
                    AUC_WT  <-  lapply(s1,function(z){ 
                            temp <- data.frame("mean" = methyl_WT[[z]],"var"=var_WT[[z]]) %>% na.omit() %>%
                                    arrange(mean)
                            fit <- loessCurve(temp, 0.5) 
                            return(fit$loessDfAUC)
                    })
                # Mut
                    s2 <- names(methyl_Mut)
                    AUC_Mut  <-  lapply(s2,function(z){ 
                            temp <- data.frame("mean" = methyl_Mut[[z]],"var"=var_Mut[[z]]) %>% na.omit() %>%
                                     arrange(mean)
                            fit <- loessCurve(temp, 0.5) 
                            return(fit$loessDfAUC)
                    })
                
                ## 4. test statistic
                    t1 <- as.numeric(AUC_WT)
                    t2 <- as.numeric(AUC_Mut)
                    p <- ifelse(length(t1) <3 || length(t2) <3, NA,t.test(t1,t2) %>% broom::tidy() %>% dplyr::select(p.value) %>% as.numeric() )
                    fc <- mean(t2) - mean(t1)
                    return(c(mean(t2),mean(t1),p,fc))
            })
            names(gene_AUC_Sig) <- gene_mutation
            gene_significant <- do.call(rbind, gene_AUC_Sig) %>% 
                                as.data.frame() %>% 
                                dplyr::rename(AUC_Mut = V1 ,AUC_WT = V2,p.value = V3, FC = V4) %>% mutate(FDR = p.adjust(p.value, method = "fdr", n = length(p.value)))
            return(gene_significant)
            }else {
               NA
            }
        })
names(AUC_sig_MCB) <- file 

xx = do.call(rbind,AUC_sig_MCB) %>% as.data.frame() %>% 
        rownames_to_column(var = "gene") %>% 
        separate(gene, into = c("cancer", "gene"), sep = "\\.", remove = FALSE)
## save
write.table(xx, file = paste0("AUC_MHB_Callable/TCGA_Mutation_",file,"_AUC_MHB_ttest_sig.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

path <- "/sibcb1/bioinformatics/zhangzhiqiang/MHB_Cancer/Mut_Var/AUC_MHB_Callable"
files <- list.files(path = path,pattern = "MHB_ttest_sig.txt")
test.data <- lapply(files,function(x){
                   read.table(paste0(path,x),header = T,sep = "\t") %>% 
                   na.omit() %>% 
                   filter(p.value <= 0.05)})

res <- do.call(rbind,test.data) %>% as.data.frame() %>% 
       mutate(type = ifelse(FC > 0, "increased","decreased")) %>% 
       mutate(ID = str_c(cancer,"_",gene))
## volcano plot
res1 <- res %>% filter(abs(FC) > 0.0004,p.value <= 0.0001)
p<-ggplot(res,aes(x = 1000*FC,y = -log10(FDR))) + 
   geom_point(shape = 16,size = 1.5,aes(color = type)) +
   xlim(c(-12,12)) +
   scale_y_continuous(expand = c(0,0),limits = c(0,13)) +
   scale_color_brewer(palette = "Set1",direction = -1) +
   theme_classic() +
   labs(x = "Fold Change (10-3)",y = "-log10(q Value)") + 
   theme(panel.background = element_blank(),panel.grid =element_blank(),
         axis.ticks.length=unit(5,'pt'),
         axis.text.x =element_text(angle=0,vjust=0.5), 
         plot.title=element_text(hjust=0.5))  + 
   ggrepel::geom_text_repel(data = res1,aes(x = 1000*FC,
                           y = -log10(FDR),label = ID),
                           color = "black",size = 2,
                           max.overlaps = 100000)
print(p)

Figure 6G

# Show the MHB AUC of GSE50774 (GBMLGG)
## 1. load GSE50774 data 
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]
mean0 = function(z) mean(z, na.rm = TRUE)

load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/GSE50774.RData")
meta_info = meta %>% filter(str_detect(`idh1/h3f3a mutation status:ch1`,"IDH1")) %>% 
                    mutate(mutation = ifelse(str_detect(`idh1/h3f3a mutation status:ch1`,"MUT"), "Mut", "WT"))
WT_sample_id <- meta_info %>% filter(mutation == "WT") %>% rownames()
Mut_sample_id <- meta_info %>% filter(mutation == "Mut") %>% rownames()
## MHB
GBM_MCB = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Call_able.cpg.bed")

## methylation 
ArrayIntervalMethylation <- function(MM, IntervalGR, cpgGR){
  # check seqlevels
  if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
    stop("No shared seqlevels found.\n")
  # only keep CpGs that locate in intervals
  Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
  Idx2 = names(cpgGR) %in% rownames(MM)
  Idx  = Idx1 & Idx2
  if(sum(Idx) == 0)
    stop("No CpGs found in pre-defined regions.\n")
  cpgGR = cpgGR[Idx, ]
  # keep shared CpGs
  cNames = intersect(rownames(MM), names(cpgGR))
  cpgGR  = cpgGR[cNames]
  MM    = MM[cNames, ]
  # Sample-by-sample processing to control memory 
  mergedM = matrix(NA, length(IntervalGR), ncol(MM))
  x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
  MM.adj <- MM[queryHits(x),]
  aT  = aggregate(MM.adj, by = list(factor(subjectHits(x))), FUN="mean0")
  aID = as.character(aT[,1])
  aMM = as.matrix(aT[,-1])
  mergedM[as.integer(aID), ] = aMM
  colnames(mergedM) = colnames(MM)
  ## return
  return(mergedM)
}
## variation
var0 <- function(x) ifelse(length(x)>=3, var(x,na.rm=T),NA)
ArrayIntervalMethylVar <- function(MM, IntervalGR, cpgGR){
  # check seqlevels
  if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
    stop("No shared seqlevels found.\n")
  # only keep CpGs that locate in intervals
  Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
  Idx2 = names(cpgGR) %in% rownames(MM)
  Idx  = Idx1 & Idx2
  if(sum(Idx) == 0)
    stop("No CpGs found in pre-defined regions.\n")
  cpgGR = cpgGR[Idx, ]
  # keep shared CpGs
  cNames = intersect(rownames(MM), names(cpgGR))
  cpgGR  = cpgGR[cNames]
  MM    = MM[cNames, ]
  # Sample-by-sample processing to control memory
  mergedM = matrix(NA, length(IntervalGR), ncol(MM))
  x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
  MM.adj <- MM[queryHits(x),]
    aT  = aggregate(MM.adj, by = list(factor(subjectHits(x))), FUN="var0")
  aID = as.character(aT[,1])
  aMM = as.matrix(aT[,-1])
  mergedM[as.integer(aID), ] = aMM
  colnames(mergedM) = colnames(MM)
  ## return
  return(mergedM)
}
## methylatin & variation
GBM_Methylation = ArrayIntervalMethylation(MeM, GBM_MCB, RnBeads_450K_hg19_GR)
GBM_MethylVar = ArrayIntervalMethylVar(MeM, GBM_MCB, RnBeads_450K_hg19_GR)

## 2. MHB AUC analysis
    # MHB methylation data
methyl_WT  <- GBM_Methylation %>% as.data.frame() %>% dplyr::select(all_of(WT_sample_id)) %>%
             mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "mean")
methyl_Mut <- GBM_Methylation %>% as.data.frame() %>% dplyr::select(all_of(Mut_sample_id)) %>% 
            mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "mean")
    # MHB variation data
var_WT  <- GBM_MethylVar %>% as.data.frame() %>% dplyr::select(all_of(WT_sample_id)) %>% 
            mutate(ID=1:nrow(.))  %>% pivot_longer(-ID, names_to = "sample", values_to = "var")
var_Mut <- GBM_MethylVar %>% as.data.frame() %>% dplyr::select(all_of(Mut_sample_id)) %>% 
            mutate(ID=1:nrow(.))  %>% pivot_longer(-ID, names_to = "sample", values_to = "var")
    # Merged
WT <- methyl_WT %>% inner_join(var_WT, by = c("ID","sample")) %>% na.omit() %>% arrange(mean)  %>% 
            mutate(group = cut_interval(mean,length = 0.005)) %>% 
            group_by(group) %>% summarise(mean = mean(mean), var = mean(var))  %>% 
            ungroup() 
Mut <- methyl_Mut %>% inner_join(var_Mut, by = c("ID","sample")) %>% na.omit() %>% arrange(mean)  %>%
            mutate(group = cut_interval(mean,length = 0.005)) %>% 
            group_by(group) %>% summarise(mean = mean(mean), var = mean(var))  %>% 
            ungroup()           
## 3. ROC analysis
    # WT
    mcb_WT <- loessCurve(WT, 0.5)
    # Mut
    mcb_Mut <- loessCurve(Mut, 0.5)
## 4. t.test
    set_WT <-  data.frame(MM=mcb_WT$xloessDfVals,var=mcb_WT$yloessDfVals,type="WT") 
    set_Mut <-  data.frame(MM=mcb_Mut$xloessDfVals,var=mcb_Mut$yloessDfVals,type="Mut")
    dataset1 <- rbind(set_WT, set_Mut)
## 5. plot
p<-ggplot(dataset1, aes(x=MM, y=var, color=type)) +
    geom_line(aes(color=type)) +
    scale_color_manual(values = c("Mut" = "firebrick", "WT" = "steelblue")) +
    geom_area(aes(fill = type),alpha = 0.3,position="identity") +
    scale_fill_manual(values = c("Mut" = "firebrick", "WT" = "steelblue")) +
    ggtitle(paste0("GBM (GSE50774)")) + 
    labs(x = "Methylation", y = "Variation") +
    scale_y_continuous(expand=c(0,0),limits = c(0,0.05)) +
    annotate("text",x=0.45,y=0.045,label=paste0("WT-IDH1 AUC = ",round(mcb_WT$loessDfAUC,3)),size=3,color="steelblue") + 
    annotate("text",x=0.45,y=0.04,label=paste0("Mut-IDH1 AUC = ",round(mcb_Mut$loessDfAUC,3)),size=3,color="firebrick") +
    geom_segment(x = 0.5, xend = 0.5, y = 0, yend = mcb_Mut$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +
    geom_segment(x = 0, xend = 0.5, y = mcb_WT$yValue, yend = mcb_WT$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +
    geom_segment(x = 0, xend = 0.5, y = mcb_Mut$yValue, yend = mcb_Mut$yValue,                  
               linetype = "dashed", color = "gray",size=0.5) +  
    geom_text(label = round(mcb_Mut$yValue,3), x = 0, y = mcb_Mut$yValue,
            vjust = 1.5, color = "firebrick", size = 3) +                  
    geom_text(label = round(mcb_WT$yValue,3), x = 0, y = mcb_WT$yValue,
            vjust = 1.5, color = "steelblue", size = 3) +
    theme_bw() +
    theme(panel.background = element_blank(),
         panel.border = element_rect(color = "black", linewidth = 0.8,linetype = "solid"),
          panel.grid = element_blank(),
          plot.title = element_text(hjust = 0.5),
          plot.subtitle = element_text(hjust = 0.5),
          axis.ticks = element_line(linewidth  = 0.5,color="black"),
          axis.line = element_blank())
    
print(p)