The mHap map of common cancers: Identification of MHBs in cancers

loading required packages:

library(tidyverse)
library(stringr)
library(data.table)
library(ggplot2)
library(pheatmap)
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(LOLA)
library(GenomicRanges)
library(reshape2)
library(regioneR)

Identification of MHBs

The DNA Methylation haplotype blocks (MHBs) were identified by mHapSuite.

java -jar mHapSuite-2.0-jar-with-dependencies.jar MHBDiscovery \
                                  -mhapPath BRCA.mhap.gz \
                                  -cpgPath hg19_CpG.gz \
                                  -tag MHB_BRCA  \
                                  -outputDir MHB/tumor

Figure 1C

cancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",pattern=".bed")
counts <- lapply(cancer_type,function(x) {
            fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",x),sep="\t") %>% nrow()})
names(counts) <- cancer_type %>% str_split_i("[_.]",i=2)
Ncount=as.data.frame(do.call(rbind,counts)) %>% rownames_to_column(var="Type")

p<-ggplot(Ncount,aes(Type,V1)) + 
    geom_bar(stat="identity",fill="steelblue",width=0.8)+
    scale_y_continuous(expand=c(0,0),limits=c(0,30000))+
    labs(y="The Number of MHBs")+
    theme_bw()+
    theme(panel.grid= element_blank(),
          panel.background=element_blank(),
          panel.border=element_blank(),
          axis.line=element_line(color="black"),
          axis.title.x=element_blank(),
          axis.text.x = element_text(color="black",size=10, angle=45,vjust=1,hjust=1),
          axis.text.y = element_text(color="black",size=10),
          axis.ticks = element_line(color="black"),
          axis.ticks.length = unit(5, "pt") )+
    geom_text(aes(label=V1), vjust=-0.3, size=3.5,color="black")
print(p)

Figure 1D

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

mhb_clusters_Anno <- annotatePeak("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz",
                                  tssRegion=c(-2000, 2000),
                                  TxDb=txdb, 
                                  annoDb="org.Hs.eg.db")

>> loading peak file...              2024-11-03 12:55:29 PM 
>> preparing features information...         2024-11-03 12:55:30 PM 
>> identifying nearest features...       2024-11-03 12:55:31 PM 
>> calculating distance from peak to TSS...  2024-11-03 12:55:33 PM 
>> assigning genomic annotation...       2024-11-03 12:55:33 PM 
>> adding gene annotation...             2024-11-03 12:55:47 PM 
>> assigning chromosome lengths          2024-11-03 12:55:47 PM 
>> done...                   2024-11-03 12:55:47 PM

plotAnnoPie(mhb_clusters_Anno)

Figure 1E

mhb_CT_matrix = read.table("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_CT_matrix.txt.gz",sep = "\t",header=T)
names(mhb_CT_matrix) = gsub("MHB_","",names(mhb_CT_matrix))
mhb_CT_matrix = mhb_CT_matrix[c(1:6,11:31,8:10,7)]
mhb_CT_matrix=mhb_CT_matrix[order(mhb_CT_matrix$num,mhb_CT_matrix$cluster,decreasing=F),]  
# transform 1-0 matrix
mhb_CT_matrix[-c(1:6)][mhb_CT_matrix[-c(1:6)]>=1]<-1
rownames(mhb_CT_matrix) = paste(mhb_CT_matrix$chr,mhb_CT_matrix$start,mhb_CT_matrix$end,sep="_")
# Ordering function
order_f <-function(x,Tag,cluster){
    num = table(x[Tag][x$cluster==paste0(cluster,"_cluster"),])
    x[Tag][x$cluster==paste0(cluster,"_cluster"),]<-c(rep(0,num["0"]),rep(1,num["1"]))
    data = x
  return(data)
}

# Public GEO data : CRC_P_RRBS ESCC_P LIHC_P
mhb_CT_matrix = order_f(mhb_CT_matrix,"CRC_P","COAD")
mhb_CT_matrix = order_f(mhb_CT_matrix,"ESCC_P","ESCA")
mhb_CT_matrix = order_f(mhb_CT_matrix,"LIHC_P","LIHC")
# Our previous data : Cancer of Unknown Primary data (CUP)
cup<-c("BRCA","CESC","COADREAD","ESCA","HNSC","LIHC","LUSCLUAD","OV","STAD","THCA")
for (tag in cup) {
   if (tag == "COADREAD" ) {
         flag <- "COAD"
   }else if (tag == "LUSCLUAD") {
         flag <- "NSCLC"
   }else{
         flag <- tag
   }
     mhb_CT_matrix = order_f(mhb_CT_matrix,paste0("CUP_",tag),flag)
}

# MHB Map plot
annotation_row = mhb_CT_matrix[c(6)]
annotation_col = data.frame(Assay = factor(c(rep("WGBS",11),rep("RRBS",11),rep("WGBS",3))), 
                                          Tissue_Type = factor(c(rep("Tumor",24),rep("Normal",1))))
rownames(annotation_col) = names(mhb_CT_matrix[-c(1:6)])
annotation_colors = list(Assay=c(WGBS="#4DAF4A",RRBS="#984EA3"),
                        Tissue_Type=c(Tumor="#E41A1C",Normal="#377EB8"),
                        cluster=c(BRCA_cluster="#A6CEE3",CESC_cluster="#1F78B4",
                        COAD_cluster="#B2DF8A",ESCA_cluster="#FDBF6F",
                        HNSC_cluster="#33A02C",LIHC_cluster="#FB9A99",
                        NSCLC_cluster="#E31A1C",OV_cluster="#FF7F00",
                        PACA_cluster="#CAB2D6", STAD_cluster="#6A3D9A",
                        THCA_cluster="#B15928",cluster12="#1B9E77",
                        cluster13="#D95F02",cluster14="#7570B3",
                        cluster15="#E7298A",cluster16="#66A61E"))
data = as.data.frame(mhb_CT_matrix[-c(1:6)])

p <- pheatmap(data,show_rownames=F,cluster_cols= F,cluster_row=F,
                annotation_legend=T,legend=T,
                annotation_row=annotation_row,annotation_col=annotation_col,
                annotation_colors=annotation_colors,
                annotation_names_row=T,border = F,border_color = F,
                gaps_col = c(11,21,24),scale = "none",angle_col = 45,
                color = colorRampPalette(colors = c("white","red"))(1000)
              )
print(p)

Figure 1F

mhb_T="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/mhb_clusters/"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
files = list.files(mhb_T)
locResults <- lapply(files,function(i){
            # Read input
            mhb=readBed(paste0(mhb_T,i))
            # Universe_Sets Background
            universe_set=fread(background) %>% toGRanges()
            # Overlapped normal MHB
            states=loadRegionDB("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/LOLA_Normal_DB")
            # Runing LOLA
            runLOLA(mhb, universe_set, states, cores=10)
            })
names(locResults) <- files

data <- lapply(files,function(x){ locResults[[x]][,c("filename","pValueLog")] %>% mutate(Type=x)})
data = as.data.frame(do.call(rbind,data))
dta = dcast(data,Type~filename,value.var = "pValueLog")
rownames(dta) = dta$Type;dta$Type=NULL
dta = dta[c("BRCA_cluster.bed","CESC_cluster.bed","COAD_cluster.bed",
            "ESCA_cluster.bed","HNSC_cluster.bed","LIHC_cluster.bed",
            "NSCLC_cluster.bed","OV_cluster.bed","PACA_cluster.bed",
            "STAD_cluster.bed", "THCA_cluster.bed","cluster12","cluster13",
            "cluster14","cluster15","cluster16"),]
# filter P value < 0.05
dta[dta< -log10(0.05)]<-NA
# enrichemnt ranking
rank_t = function(x) {
      m = rank(array(-x),na.last="keep",ties.method = "min")
      return(m)
  }
data_rank= as.data.frame(t(apply(dta,1,rank_t)))
names(data_rank) = names(dta)
tag<-names(data_rank)[-c(9)]
data_rank<- data_rank[c(tag,"MHB_placenta_normal.bed")]

# The enrichment rank of Cancer MHBs vs. Normal MHBs 
p<- pheatmap(data_rank,
                main="Cancer MHBs clusters vs. Normal MHBs",
                show_colnames=T,show_rownames=T,
                cluster_rows=F,cluster_cols=F,
                scale="none",angle_col="90",
                color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000)
                )
print(p)