Cancer MHBs are enriched in open chromatin

Figure 2. Cancer MHBs are enriched in accessible chromatin regions. (A) Overlapping of cancer MHBs with regions of open chromatin in matched cancer types. ATAC-seq peaks were defined previously and downloaded from NCI’s Genomic Data Commons40. (B) Annotation of MHBs to regions of DNA methylation states, including UMR, LMR, PDR, and HMR. (C-D) The enrichment of MHBs in LMR and PMD, respectively. Enrichment test was performed by R package LOLA, with the union of MHBs serving as the background set. The resulting adjusted P-values for significant enrichment were ranked across all cancer types (FDR < 0.01). Non-significant results are shown in grey. (E) Genomic regions with MHBs are more enriched in regions of open chromatin when controlling for mean methylation levels. The enrichment score was calculated as the percentage of CpGs covered by open chromatin regions. (F) Enrichment of MHBs in ChIA-PET in a disease status-specific manner. The enrichment score was calculated as the ratio of observed to expected overlaps between MHBs and ChIA-PET regions with permutation test.

loading required packages:

library(tidyverse)
library(stringr)
library(data.table)
library(ggplot2)
library(pheatmap)
library(regioneR)
library(parallel)
library(rtracklayer)
library(MethylSeekR)
library(BSgenome.Hsapiens.UCSC.hg19)
library(GenomicRanges)
library(LOLA)

Identification of PMDs, LMRs, UMRs

# Input CpGs
data_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Methyl_bdg/"
samples=list.files(data_path,pattern="Count_CpG.bedGraph.gz")
# hg19 CGI
      session <- browserSession()
      genome(session) <- "hg19"
      query <- ucscTableQuery(session, table = "cpgIslandExt")
      CpGislands.gr <- track(query)
      genome(CpGislands.gr) <- NA

# Remove CGI +/-5K CpGs
      CpGislands.gr <-suppressWarnings(resize(CpGislands.gr, 5000, fix="center"))
    
for ( i in samples){
      # Load Methylation
      x <- fread(paste0(data_path,i)) %>% toGRanges()
      names(mcols(x)) = c("M","Um")
      mcols(x)[,"T"] = mcols(x)[,1] + mcols(x)[,2]
      ranges(x) <- end(x)
      mcols(x) <- mcols(x)[,c("T","M")]
      tag <- gsub("_Merged_Count_CpG.bedGraph","",i)
      # PMD
      PMDsegments<-segmentPMDs(m=x, chr.sel="chr22",seqLengths=sLengths,num.cores=10)
      # FDR cut-off
      stats <- suppressWarnings(calculateFDRs(m=x, CGIs=CpGislands.gr,
                            PMDs=PMDsegments, num.cores=10))
      FDR.cutoff <- 5
      m.sel <- 0.5
      n.sel=as.integer(names(stats$FDRs[as.character(m.sel), ]
            [stats$FDRs[as.character(m.sel), ]<FDR.cutoff])[1])
      # UMR LMR
      UMRLMRsegments <- segmentUMRsLMRs(m=x, meth.cutoff=m.sel,
                        nCpG.cutoff=n.sel, PMDs=PMDsegments,
                        num.cores=10, myGenomeSeq=Hsapiens,minCover=5,
                        seqLengths=sLengths)
      # Saving
      # PMD
      write.table(as.data.frame(PMDsegments[PMDsegments$type=="PMD"])[c(1:3)],file=paste0("res/",tag,"_PMDs.txt"),
                sep="\t",quote=F,col.names=F,row.names=F)
      # LMR & UMR 
      write.table(granges(UMRLMRsegments[UMRLMRsegments$type=="LMR",]),file=paste0("res/",tag,"_LMRs.txt"),
                sep="\t",quote=F,col.names=F,row.names=F)
      write.table(granges(UMRLMRsegments[UMRLMRsegments$type=="UMR",]),file=paste0("res/",tag,"_UMRs.txt"),
                sep="\t",quote=F,col.names=F,row.names=F)
      }

Figure 2A

# The Frequency of MHB in ATAC-peaks
cancer_types <- list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/") %>% 
                  str_split_i(pattern="[_.]",i=2) %>% setdiff(c("OV","PACA"))
mn = list()
for ( sample in cancer_types ){
    mhb_peak = fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/MHB_",sample,".bed.gz")) %>%
                toGRanges()
    mhb_tag = sample
    
    # Read the TCGA ATAC-seq Peaks
      ATAC_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ATAC/"
            if (mhb_tag == "BRCA"){
                  i = "BRCA_hg19_peak.bed.gz"
            }else if(mhb_tag == "CESC"){
                  i = "CESC_hg19_peak.bed.gz"
            }else if (mhb_tag == "COAD"){
                  i = "COAD_hg19_peak.bed.gz"
            }else if (mhb_tag == "ESCA"){
                  i = "ESCA_hg19_peak.bed.gz"
            }else if (mhb_tag =="HNSC"){
                  i = "HNSC_hg19_peak.bed.gz"
            }else if (mhb_tag == "LIHC"){
                  i = "LIHC_hg19_peak.bed.gz"
            }else if(mhb_tag == "NSCLC"){
                  i = "LUAD_hg19_peak.bed.gz"
            }else if (mhb_tag == "STAD"){
                  i = "STAD_hg19_peak.bed.gz"
            }else if(mhb_tag == "THCA"){
                  i = "THCA_hg19_peak.bed.gz"
        }
    tag = strsplit(i,"_")[[1]][1]
    # ATAC-seq peaks
    assign(tag,fread(paste0(ATAC_path,i)) %>% toGRanges())
    # Cancer specific MHB
    x=findOverlaps(get(tag),mhb_peak,type = "any", ignore.strand = TRUE)
    
    # Overlapping
    cM=length(unique(subjectHits(x)))
    # total MHB 
    cL=length(mhb_peak)
    m = round(100*cM/cL,2)
    mn[[tag]] = m
}
# add LUSC
    i="LUSC_hg19_peak.bed.gz"
    tag = strsplit(i,"_")[[1]][1]
    assign(tag,fread(paste0(ATAC_path,i)) %>% toGRanges())
    x=findOverlaps(get(tag),mhb_peak,type = "any", ignore.strand = TRUE)
    cM=length(unique(subjectHits(x)))
    cL=length(mhb_peak)
    m = round(100*cM/cL,0)
    mn[[tag]] = m

MHB_Ov_ATAC=as.data.frame(do.call(rbind,mn))
MHB_Ov_ATAC$type = rownames(MHB_Ov_ATAC)
MHB_Ov_ATAC = MHB_Ov_ATAC[order(rownames(MHB_Ov_ATAC)),]

# PLOT
p<-ggplot(MHB_Ov_ATAC,aes(type,V1)) + 
        geom_bar(stat="identity",fill="steelblue",width=0.8)+
        scale_y_continuous(expand=c(0,0),limits=c(0,80))+
        labs(y="The Frequency of Cancer-type MHBs \n Overlap with TCGA ATAC peaks (%)")+
        theme_bw() +
        theme(panel.grid= element_blank(),panel.background=element_blank(),
              panel.border=element_blank(),axis.line=element_line(color="black"),
              axis.title.x=element_blank(),
              axis.text.x = element_text(color="black",size=10,angle=45,vjust=1,hjust=1),
              axis.text.y = element_text(color="black",size=10)) +
        geom_text(aes(label=paste0(V1,"%")), vjust=-0.3, size=3.5,color="black")
print(p)

Figure 2B

# The Frequency of MHB in genomic features.(PMD, LMR, UMR, HMR)
cancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/")
mn =list()
for (i in cancer_type ){
      
    mhb = fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",i)) %>% toGRanges()
    mhb_tag = i %>% str_split_i(pattern="[_.]",i=2)

      # Read the cancer type PMD HMR LMR UMR regions
      MR_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/"
      if (mhb_tag == "BRCA"){
            tag = "breast_T"
      }else if(mhb_tag == "CESC"){
            tag = "cervix_T"
      }else if (mhb_tag == "COAD"){
            tag = "colon_T"
      }else if (mhb_tag == "ESCA"){
            tag = "esophagus_T"
      }else if (mhb_tag =="HNSC"){
            tag = "head_and_neck_T"
      }else if (mhb_tag == "LIHC"){
            tag = "liver_T"
      }else if(mhb_tag == "NSCLC"){
            tag = "lung_T"
      }else if (mhb_tag == "STAD"){
            tag = "stomach_T"
      }else if(mhb_tag == "THCA"){
            tag = "thyroid_T"
      }else if(mhb_tag == "OV"){
            tag = "ovary_T"
      }else if(mhb_tag == "PACA"){
            tag = "pancreas_T"
      }
      
    # Genomic feature regions
    GFR = fread(paste0(MR_path,tag,"_genomic_segments.bed.gz")) %>% toGRanges()
   
    # Cancer-type MHB overlaps
    # PMD
    x1=findOverlaps(subset(GFR,V4=="PMD"),mhb,type = "any", ignore.strand = TRUE)
    m1=length(unique(subjectHits(x1)))
    # HMR
    x2=findOverlaps(subset(GFR,V4=="HMR"),mhb,type = "any", ignore.strand = TRUE)
    m2=length(unique(subjectHits(x2)))
    # LMR
    x3=findOverlaps(subset(GFR,V4=="LMR"),mhb,type = "any", ignore.strand = TRUE)
    m3=length(unique(subjectHits(x3)))
    # UMR
    x4=findOverlaps(subset(GFR,V4=="UMR"),mhb,type = "any", ignore.strand = TRUE)
    m4=length(unique(subjectHits(x4)))

    m = data.frame(freq = c(m1,m2,m3,m4)/sum(m1+m2+m3+m4))
    rownames(m) = c("PMD","HMR","LMR","UMR")
    names(m) = tag
    mn[[tag]] = m
}

res = do.call(cbind,mn) %>% as.data.frame() %>% rownames_to_column(var="segments") %>% 
                        pivot_longer(!segments,names_to="variable",values_to="value")
# PLOT
p<-ggplot(res,aes(x=variable,y=value*100,fill=factor(segments,level=c("HMR","PMD","LMR","UMR"))))+
      geom_bar(stat="identity",position="stack")+
      scale_fill_brewer(palette = "Set1") +
      ggtitle("The Genomic distribution of Cancer-type MHBs")+
      labs(x="",y="Percent (%)") +
      theme_bw()+
      theme(panel.grid = element_blank(),panel.background=element_blank(),
            axis.ticks.length=unit(5,'pt'),axis.line=element_line(color="black"),
            axis.title.x=element_blank(),
            axis.text.x = element_text(color="black",size=10,angle=45,vjust=1,hjust=1),
            axis.text.y = element_text(color="black",size=10),
            axis.ticks = element_line(color="black")) + 
      guides(fill=guide_legend(title=NULL))
print(p)

Figure 2C-2D

mhb = "/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
files = list.files(mhb)
# LOLA enrichment
locResults = lapply(files,function(i){
      # Input
      mhb=fread(paste0(mhb,i)) %>% toGRanges()
      # Universe_Sets Background
      universe_set=fread(background) %>% toGRanges()
      # Genomic features
      states=loadRegionDB("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/LOLA")
      # Runing LOLA
      runLOLA(mhb, universe_set, states, cores=10)
  })
names(locResults) <-files

# enrichment output
res = lapply(files,function(i){
    mx = locResults[[i]][,c("qValue","filename")]
    mx$Type = i
    mx 
})
data <- as.data.frame(do.call(rbind,res))

# filter FDR < 0.01
data <- data %>% mutate(qValue=ifelse(qValue<=0.01,qValue,NA),qValue=-log10(qValue)) %>%
                  pivot_wider(names_from=filename,values_from=qValue)

# significance ranking
rank_t = function(x) {
    m = rank(array(-x),na.last="keep",ties.method = "min")
    return(m)
}

# LMR
lmr <- as.data.frame(t(apply(data %>% dplyr::select(contains("LMR")),1,rank_t)))
rownames(lmr) <- files
names(lmr) <- data %>% dplyr::select(contains("LMR")) %>% names()
lmr <- lmr %>% dplyr::select(sort(names(.)))
names(lmr) <- str_split_i(names(lmr),"\\.",i=1)
rownames(lmr) <- str_split_i(rownames(lmr),"\\.",i=1)

# PMD
pmd <- as.data.frame(t(apply(data %>% dplyr::select(contains("PMD")),1,rank_t)))
rownames(pmd) <- files 
names(pmd) <- data %>% dplyr::select(contains("PMD")) %>% names() 
pmd <- pmd %>% dplyr::select(sort(names(.)))
names(pmd) <- str_split_i(names(pmd),"\\.",i=1)
rownames(pmd) <- str_split_i(rownames(pmd),"\\.",i=1)

# PLOT LMR, PMD
p1<- pheatmap(lmr,
             main="Cancer MHBs vs. LMRs",
             show_colnames=T,show_rownames=T,
             cluster_rows=F,cluster_cols=F,
             scale="none",angle_col="90",
             color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000))
print(p1)
p2<- pheatmap(pmd,
             main="Cancer MHBs vs. PMDs",
             show_colnames=T,show_rownames=T,
             cluster_rows=F,cluster_cols=F,
             scale="none",angle_col="90",
             color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000))
print(p2)

Figure 2E

# Cancer Type ATAC-seq Peak
ATAC_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ATAC/"
# Genomic segments
segments_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments"
# CpG sites
hg19_CpG="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/hg19_CpG.bed"

for i in `ls ${segments_path}/*_T_MHB_genomic_segments.bed`
do
      a=${i##*/}
    if [[ "${a}" == *"breast"* ]];then
        tag="BRCA_hg19_peak.bed"
    elif [[ "${a}" == *"cervix"* ]];then
        tag="CESC_hg19_peak.bed"
    elif [[ "${a}" == *"colon"* ]];then
        tag="COAD_hg19_peak.bed"
    elif [[ "${a}" == *"esophagus"* ]];then
        tag="ESCA_hg19_peak.bed"
    elif [[ "${a}" == *"head_and_neck"* ]];then
        tag="HNSC_hg19_peak.bed"
    elif [[ "${a}" == *"lung"* ]];then
        tag="LUAD_hg19_peak.bed"
    elif [[ "${a}" == *"liver"* ]];then
        tag="LIHC_hg19_peak.bed"
    elif [[ "${a}" == *"stomach"* ]];then
        tag="STAD_hg19_peak.bed"
    elif [[ "${a}" == *"thyroid"* ]];then
        tag="THCA_hg19_peak.bed"
    fi
    # MHB LMR UMR PMD HMR
    less ${i} |grep MHB | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a -  -b \
                ${ATAC_path}/${tag}  > ${tag%%_*}_segments_ATAC_MHB_coverage.bed 
    less ${i} |grep LMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a -  -b \
                ${ATAC_path}/${tag}  > ${tag%%_*}_segments_ATAC_LMR_coverage.bed 
    less ${i} |grep UMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a -  -b \
                ${ATAC_path}/${tag}  > ${tag%%_*}_segments_ATAC_UMR_coverage.bed 
    less ${i} |grep PMD | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a -  -b \
                ${ATAC_path}/${tag}  > ${tag%%_*}_segments_ATAC_PMD_coverage.bed 
    less ${i} |grep HMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a -  -b \
                ${ATAC_path}/${tag}  > ${tag%%_*}_segments_ATAC_HMR_coverage.bed 
done
# add LUSC
    less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep MHB | bedtools intersect -a ${hg19_CpG} -b - | \
             bedtools coverage -a -  -b ${ATAC_path}/LUSC_hg19_peak.bed  >LUSC_segments_ATAC_MHB_coverage.bed 
    less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep LMR | bedtools intersect -a ${hg19_CpG} -b - | \
             bedtools coverage -a -  -b ${ATAC_path}/LUSC_hg19_peak.bed  > LUSC_segments_ATAC_LMR_coverage.bed 
    less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep UMR | bedtools intersect -a ${hg19_CpG} -b - | \
             bedtools coverage -a -  -b ${ATAC_path}/LUSC_hg19_peak.bed  > LUSC_segments_ATAC_UMR_coverage.bed 
    less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep PMD | bedtools intersect -a ${hg19_CpG} -b - | \
             bedtools coverage -a -  -b ${ATAC_path}/LUSC_hg19_peak.bed  > LUSC_segments_ATAC_PMD_coverage.bed 
    less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep HMR | bedtools intersect -a ${hg19_CpG} -b - | \
             bedtools coverage -a -  -b ${ATAC_path}/LUSC_hg19_peak.bed  > LUSC_segments_ATAC_HMR_coverage.bed 

The Genomic segments overlap with TCGA ATAC-seq peaks in CpG base pairs level

path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ATAC_segment_coverage/"
samples=list.files(path,pattern="_MHB_coverage.bed")
# Methylation
beta_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Methyl_bdg/"
CpG_segment_ATAC_rate<-lapply(samples,function(x){
    if(str_detect(x,"BRCA")){
        beta<-paste0(beta_path,"breast_T_Merged_Count_CpG.bedGraph.gz")
    }else if(str_detect(x,"CESC")){
        beta<-paste0(beta_path,"cervix_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"COAD")){
        beta<-paste0(beta_path,"colon_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"ESCA")){
        beta<-paste0(beta_path,"esophagus_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"HNSC")){
        beta<-paste0(beta_path,"head_and_neck_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"LIHC")){
        beta<-paste0(beta_path,"liver_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"LUAD")){
        beta<-paste0(beta_path,"lung_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"LUSC")){
        beta<-paste0(beta_path,"lung_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"STAD")){
        beta<-paste0(beta_path,"stomach_T_Merged_Count_CpG.bedGraph.gz")
    } else if(str_detect(x,"THCA")){
        beta<-paste0(beta_path,"thyroid_T_Merged_Count_CpG.bedGraph.gz")
    }
    # read mhb data 
    mhb_coverage<- fread(paste0(path,x)) %>% 
                    dplyr::select(c(1:3)) %>% mutate(MHB="MHB")
    # Genomic features were divided into MHB overlapping or not  
    gf <- c("LMR","UMR","PMD","HMR")
    Genomic_coverage <- lapply(gf, function(m) { 
                            tag<-str_replace_all(x,"MHB",m)
                            fread(paste0(path,tag)) %>% dplyr::select(c(1:3,6,7)) %>% mutate(Type=m) %>%
                            left_join(mhb_coverage,by=c("V1","V2","V3")) %>% 
                            mutate(MHB = ifelse(is.na(MHB),paste0(m,"_NonMHB"),paste0(m,"_MHB")))
                            })
    coverage <- do.call(rbind,Genomic_coverage) %>% as.data.frame()
    MM <- fread(beta) %>% transmute(V1=V1,V2=V2,V3=V3,V4=ifelse(V4+V5>=10,round(V4/(V4+V5),2),NA))
    # merge
    left_join(coverage,MM,by=intersect(names(coverage),names(MM))) %>% drop_na()
})
names(CpG_segment_ATAC_rate) <- samples %>% str_split_i(pattern="_",i=1)

# In cancer type 
Cancer_segment_ATAC<- lapply(names(CpG_segment_ATAC_rate),function(i){
        CpG_segment_ATAC_rate[[i]] %>% dplyr::rename("olen"="V6","tlen"="V7","Beta"="V4") %>% arrange(Beta)
} )
names(Cancer_segment_ATAC) <- names(CpG_segment_ATAC_rate)

# All the Cancer merged
    Tx=do.call(rbind,Cancer_segment_ATAC)
    Tx$Tag = sapply(strsplit(rownames(Tx),"\\."),function(x) x[1])
    ### HMR
    tHMR = Tx[Tx$Type == "HMR",] %>% arrange(Beta)
    tHMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tHMR)/10)+1))[1:nrow(tHMR)]
    trHMR = as.data.frame(tHMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
    cluster_tHMR = as.data.frame(tHMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
    trHMR = inner_join(trHMR,cluster_tHMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
    ### PMD 
    tPMD = Tx[Tx$Type == "PMD",] %>% arrange(Beta)
    tPMD$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tPMD)/10)+1))[1:nrow(tPMD)]
    trPMD = as.data.frame(tPMD[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
    cluster_tPMD = as.data.frame(tPMD[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
    trPMD = inner_join(trPMD,cluster_tPMD,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
    ### LMR
    tLMR = Tx[Tx$Type == "LMR",] %>% arrange(Beta)
    tLMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tLMR)/10)+1))[1:nrow(tLMR)]
    trLMR = as.data.frame(tLMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
    cluster_tLMR = as.data.frame(tLMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
    trLMR = inner_join(trLMR,cluster_tLMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
    ###UMR
    tUMR = Tx[Tx$Type == "UMR",] %>% arrange(Beta)
    tUMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tUMR)/10)+1))[1:nrow(tUMR)]
    trUMR = as.data.frame(tUMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
    cluster_tUMR = as.data.frame(tUMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
    trUMR = inner_join(trUMR,cluster_tUMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))

    TMR= do.call(rbind,list("UMR"=trUMR,"LMR"=trLMR,"PMD"=trPMD,"HMR"=trHMR))
    TMR$Segment = factor(sapply(strsplit(rownames(TMR),"\\."),function(x) x[1]),levels=c("UMR","LMR","PMD","HMR"))
save(TMR,file="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Genomic_segments_ATAC_MHB.RData")
# load pre-processed data
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Genomic_segments_ATAC_MHB.RData")
p<- TMR %>% mutate(Type=str_split_i(MHB,pattern="_",i=2),Segment=fct_inorder(Segment)) %>% 
        ggplot(aes(x=Beta,y=Freq,color=Type)) + 
        geom_smooth(aes(color=Type),se=F,linewidth=0.8) +
        scale_y_continuous(expand=c(0,0),limits=c(0,80))+
        scale_x_continuous(limits=c(0,1))+
        scale_color_manual(values=c("#df8f44","#00a1d5")) + 
        labs(x="Mean Methylation",y="The Percentage of CpGs covered by TCGA ATAC-seq Peaks(%)",
            title="Pan-Cancer") +
        theme_bw()+
        theme(panel.grid = element_blank(),panel.background = element_blank(),
            axis.ticks.length=unit(5,'pt'), axis.line=element_line(color="black"),
            plot.title = element_text(hjust=0.5,vjust=0.5),
            strip.background.x = element_rect(color="black",fill="gray", size=0.8, linetype="solid")) +
        geom_vline(aes(xintercept = 0.05),colour="black",linetype="dashed") +
        facet_wrap(~Segment,scales="fixed")
print(p)

Figure 2F

Enrichment of MHBs in ChIA-PET by permutation test, the ChIApet_enrichment.R script can be downloaded from here.

Rscript ChIApet_enrichment.R  \
                --forward MCF-7_BRCA_ChIA_PET_Forward.bed \
                --reverse MCF-7_BRCA_ChIA_PET_Reverse.bed \
                --genome hg19.chrom.sizes \
                --testBED  BRCA_MHB \
                --nTimes  1000 \
                --tmp ./  \
                --outFile  BRCA_MHB_ChIA_enrichment.txt
done
# Enrichment score ChIA-PET MCF7/MCF-10A
# The scores are estimated by the fold enrichment in Pol II ChIA-PET of between MHB and random regions.
MCF7_N=28.17;MCF7_T=40.06
MCF10A_N=60;MCF10A_T=53.62

data = data.frame(MCF7=c(MCF7_N,MCF7_T),MCF10A=c(MCF10A_N,MCF10A_T)) %>% 
        mutate(Type=factor(c("Normal_MHB","Tumor_MHB"),levels=c("Tumor_MHB","Normal_MHB"))) %>% 
        pivot_longer(!Type,names_to="variable",values_to="value")

p<-ggplot(data,aes(x=variable,y=value,fill=Type))+
    geom_bar(stat="identity", position=position_dodge()) +
    scale_fill_brewer(palette="Set1") +
    labs(y="Enrichment Score")+
    theme_bw()+
    theme(panel.grid= element_blank(),panel.background=element_blank(),
          panel.border=element_blank(),axis.line=element_line(color="black"),
          axis.title.x=element_blank(),axis.text.x = element_text(color="black",size=10,
                                                                  angle=45,vjust=1,hjust=1),
          axis.text.y = element_text(color="black",size=10),
          axis.ticks = element_line(color="black"),axis.ticks.length = unit(5, "pt"))+
    geom_text(aes(label=value), vjust=-0.3, size=3.5,color="black")
print(p)