Validation of colon cancer MHBs with single-cell data

Figure 4. Validation of cancer MHBs and their associated regulatory role using single-cell data. (A) Schematic of single-cell DNA methylation haplotype (sc-mHap). Top: Binary classification of CpG mean methylation (methylated: >0.9; unmethylated: < 0.1) to generate sc-mHaps. Bottom: Representative methylation haplotype patterns across 15 single cells. (B) An example of COAD MHB that was also identified in region (chr1:3,229,375-3,230,473) using CRC single-cell data. (C) Overlap of MHBs identified across different single-cell sample types. (D) Enrichment analysis of primary tumor single-cell MHBs against bulk sample MHBs from 11 cancer types using LOLA, with pan-cancer MHBs as background. (E) Association between gene expression and presence of MHBs in promoter regions. (F) Impact of MHB methylation levels on gene expression in intermediate and low methylation promoter groups. (G) Association of MHBs and dysregulation of gene expression in single-cell CRC. (H) Venn diagram showing the number of up-regulated genes in PT and LN tumor compared to normal colon tissues that harbor MHBs but lack DMRs in the promoter regions. (I) Pathway enrichment of 42 shared MHB-related genes in PT and LN.(J) A heatmap shows the mean methylation and expression of 42 shared genes that also contains MHBs across single cell. The genes annotated in MYC targets and G2/M checkpoint pathways were colored in red. (K and J) The DNA methylation and gene expression profiles of the 10 genes highlighted in panel J were validated using the TCGA-COAD dataset at different pathological stages. (L) The Kaplan-Meier survival curves comparing disease free survival between patient subgroups stratified by the high/low expression (median cutoff) of CBX3 gene in TCGA-COAD dataset.

loading required packages:

Identification of single-cell CRC MHBs

The single cell DNA Methylation haplotype blocks (scMHBs) were identified by mHapSC.

# cell barcode
bcFile="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_scMethyl_tumor_id.txt"    
# 1000 CpGs interval
bed="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/hg19_1000CpG.bed"
java -jar mHapSc-1.0-jar-with-dependencies.jar MHBDiscovery -mHapPath $mhap -bedFile $bed \
            -cpgPath hg19_CpG.gz  -bcFile $bcFile -window 5 -r2 0.5 -pvalue 0.05 \
            -outputDir ./ -tag CRC_Tumor_MHB 

Figure 4D

# Primary single-cell CRC MHB 
scMHB_Tumor="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_PT.bed.gz"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
# do LOLA enrichment
      mhb=fread(scMHB_Tumor) %>% toGRanges()
      # Universe_Sets Background
      universe_set=fread(background) %>% toGRanges()
      # regionDB
      states=loadRegionDB("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/scLOLA/")
      # Runing LOLA
      locResults = runLOLA(mhb, universe_set, states, cores=10) 
CI <- locResults %>% dplyr::select(support,b,c,d) %>% apply(1,function(x){
               CI.low <-  matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(3)
               CI.high <-  matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(4)
                  return(c(CI.low,CI.high))}) %>% 
            t() %>% as.data.frame() %>% 
            rename_with(~c("CI.low","CI.high"))
locResults <- locResults %>% cbind(CI) %>% mutate(OVR=100*round(support/size,4),
                                                logFDR=ifelse(qValue==0,300,-log10(qValue)),
                                                filename=fct_rev(fct_inorder(str_split_i(filename,"[_.]",i=2))))
# PLOT
p<- ggplot(locResults,aes(x=oddsRatio,y=filename,col=logFDR)) + 
        geom_point(aes(size=OVR,col=logFDR),shape=18) +
        geom_errorbarh(aes(xmax = CI.high, xmin = CI.low), height = 0,size=1)+
        scale_color_gradient2(midpoint=0.5*max(locResults$logFDR),space ="Lab",
                              low="steelblue",mid="orange",high="firebrick") + 
        geom_vline(aes(xintercept = 1),color="gray",linetype="dashed", size = 0.5)+
        scale_x_continuous(limits=c(0.1, 3.2), breaks = seq(0, 3.5, 0.5))+
        xlab('Odds Ratio') + ylab(" ") + 
        theme_classic() +
        theme(legend.position = "top",
              axis.text = element_text(size=15,color="black"),
              axis.title.x = element_text(size=20,color="black"),
              axis.ticks.length=unit(0.2, "cm") )
print(p)

Figure 4E

# Step 1: promoter methylation
scMethyl_Mcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_methylated.txt.gz",
                    header=T) %>% distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_total.txt.gz",
                        header=T) %>% distinct() %>% column_to_rownames(var="region")
# filter Promoter Total covered reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),
                         sep="[:-]") %>%  mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>% 
            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id  with Paired RNA & Methylation 
id <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",
            header=T) %>% dplyr::select(ID,patients,lesions,assay,tissue,tag) %>% 
            mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# Promoters overlap with MHBs
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID
    if(x!="Tumor"){
        sid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
    }else{
        sid <- id %>% pull(Methyl_ID)
    }  
    # dplyr::select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            dplyr::select(SYMBOL,all_of(sid)) %>%
                            cbind(a) %>% separate_rows(SYMBOL,sep=",")
  })
names(scMHB_Methyl) <-tag

# Step 2: scRNA + Methylation
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")

scMme <- pbapply::pblapply(tag,function(x){
    # cat(x,"\n")
    # methylation id
    if(x!="Tumor"){
        mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID) 
    }else{
        mid <- id %>% pull(Methyl_ID)
    }  
    # methylation data reshape
    data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>% 
                                          dplyr::mutate(MM = lapply(data,function(y){
                                             y %>% mutate(type=ifelse(value>=0.8,"high",
                                                    ifelse(value<=0.2,"low","intermediate"))) 
                                                              }))
    # show the paired methylation & expression ,one by one
    SME <- pbapply::pblapply(mid,function(m) {
            # mean vs median
              if(1){
                ExM <- function(z) mean(z, na.rm = TRUE)
              }else{
                ExM <- function(z) median(z, na.rm = TRUE)
              }
            # cat("Processing",m,"\n")
            # Methylation
            data <-  data_methyl  %>% filter(variable==m) %>% pull(MM) %>% as.data.frame() %>% 
                                group_by(MHB,type) %>% 
                                transmute(genes=paste0(SYMBOL, collapse=",")) %>% distinct()
            # Exp
            # test the RNA data with paired Exp id
            eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID) 
                if(sum(str_detect(names(RNA.TPM),eid))==1){
                    RNA <- RNA.TPM
                    # cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
                    flag<-"TPM"
            }else{
                    RNA <- RNA.FPKM
                    # cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
                    flag <-"FPKM"
            }
            # 6 groups
            # 1.high_mhb
            Hm<- data %>% filter(MHB=="T" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
            HmE<- RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Hm) %>% pull(1)
            HmE <- log2(ExM(HmE) +1)
            # 2.high_nonmhb
            Hnm<- data %>% filter(MHB=="N" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
            HnmE<-RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Hnm) %>% pull(1)
            HnmE <- log2(ExM(HnmE) +1)
            # 3.intermediate_mhb
            Im<- data %>% filter(MHB=="T" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
            ImE<-RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Im) %>% pull(1)
            ImE <- log2(ExM(ImE) +1)
            # 4.intermediate_nonmhb
            Inm<- data %>% filter(MHB=="N" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
            InmE<-RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Inm) %>% pull(1)
            InmE <- log2(ExM(InmE) +1)
            # 5.low_mhb
            Lm<- data %>% filter(MHB=="T" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
            LmE<-RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Lm) %>% pull(1)
            LmE <- log2(ExM(LmE) +1)
            # 6.low_nonmhb
            Lnm<- data %>% filter(MHB=="N" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
            LnmE<-RNA %>% dplyr::select(contains(eid)) %>% filter(rownames(.) %in% Lnm) %>% pull(1)
            LnmE <- log2(ExM(LnmE) +1)
                            
            # the Exp of methylation level in one cell
            c(HmE,HnmE,ImE,InmE,LmE,LnmE,flag)
    })
        names(SME) <- mid
        # cancer type : promoter methyltion + MHB +  Expression 
        CME <- do.call(rbind,SME) %>% as.data.frame() %>%
                    dplyr::rename_with(~c("HmE","HnmE","ImE","InmE","LmE","LnmE","Ex")) %>%
                     mutate(ID=rownames(.))
  })

names(scMme) <- tag 
# sc_MMET: Methylation + MHB + Exp
GSE97693_scMMET <- scMme

# Step 3: PLOT
n=0
for ( x in tag ){

    # cat(x,"\n")
    n = n + 1
    if (!is.null(GSE97693_scMMET[[x]])){   ###check is null

        data <- GSE97693_scMMET[[x]] %>% as.data.frame() %>% 
                gather("Type","Exp",-c(ID,Ex)) %>% 
                mutate(Me=str_sub(Type,0,1),MHB=toupper(str_sub(Type,2,2)),
                                                          Exp=as.numeric(Exp),Ex=fct_inorder(Ex))
        
        ## plot
        compare_means(Exp ~ MHB, data = data, group.by = c("Me","Ex"))
        assign(paste0("p",n),ggboxplot(data, x = "Me", y = "Exp", color = "MHB",
                        palette ="nejm",width=0.8,
                        bxp.errorbar = T,
                        add = "jitter",
                        add.params = list(size = 0.05),
                        facet.by = "Ex", short.panel.labs = FALSE
                        ) +
                        xlab("Methylation-level")+ 
                        ylab("Expression")+
                        ggtitle(ifelse(x=="MHB_T","Tumor",x))+ 
                        stat_compare_means(aes(group = MHB),size=2) +
                        theme_bw() +    
                        theme(panel.grid.major=element_line(colour=NA),
                        panel.background = element_rect(fill = "transparent",colour = NA),
                        plot.background = element_rect(fill = "transparent",colour = NA),
                        panel.grid.minor = element_blank(),
                        axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
                        plot.title = element_text(hjust = 0.5, size = 18),
                        panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
             )
        #     cat(n,"\n")
    }
}
# paste to one figure 
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)

Figure 4F

scMethyl_Mcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_methylated.txt.gz",
                    header=T) %>% distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_total.txt.gz",
                    header=T) %>% distinct() %>% column_to_rownames(var="region")

# filter promoter total reads count <3 
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,
                        into=c("chr","start","end"),sep="[:-]") %>% 
                        mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id  with Paired RNA & Methylation 
id <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",
            header=T) %>% dplyr::select(ID,patients,lesions,assay,tissue,tag) %>% 
            mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# promoter overlaps with MHB
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping 
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID
    if(x!="Tumor"){
        sid <- id %>% dplyr::filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
    }else{
        sid <- id %>% pull(Methyl_ID)
    }  
    # select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            dplyr::select(SYMBOL,all_of(sid)) %>%
                            cbind(a) 
})
names(scMHB_Methyl) <-tag

# Step 2: scRNA + Methylation
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")

scMme <- pbapply::pblapply(tag,function(x) {
    # cat(x,"\n")
    # methylation id
    if(x!="Tumor"){
        mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID) 
    }else{
        mid <- id %>% pull(Methyl_ID)
    }  
    # methylation data reshape
    data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>% 
                                          dplyr::mutate(MM = lapply(data,function(y){
                                        y %>% mutate(type=ifelse(value>=0.8,"high",
                                                    ifelse(value<=0.2,"low","Intermediate")))
                                                              }))
# MHB Methylation
scMethyl_MHB_Mcounts <- fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/CRC_MHB_",
                            x,"_methylated.txt.gz"),header=T) %>% 
                            distinct() %>% column_to_rownames(var="region")
scMethyl_MHB_Tcounts <- fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/CRC_MHB_",
                            x,"_total.txt.gz"),header=T) %>% 
                            distinct() %>%column_to_rownames(var="region")
# filter MHB with Total reads count <3
scMethyl_MHB_Tcounts[scMethyl_MHB_Tcounts<3]<- NA
scMethyl_MHB <- scMethyl_MHB_Mcounts/scMethyl_MHB_Tcounts 

scMethyl_MHB.GR <- scMethyl_MHB  %>% rownames_to_column(var="region") %>% 
                        separate(region,into=c("chr","start","end"),sep="[:-]") %>% 
                        mutate(start=as.numeric(start)-1,end=as.numeric(end)) %>% 
                        toGRanges() %>% great(cores=10,"MSigDB:H", "GREAT:hg19") %>% 
                        getRegionGeneAssociations() %>% 
                        as.data.frame() %>%  mutate(annotated_genes=lapply(annotated_genes,function(x){
                        x %>% paste(collapse=",")
                        }) %>% do.call(rbind,.) %>% as.character(),
                        dist_to_TSS = lapply(dist_to_TSS,function(x){
                                            x %>% paste(collapse=",")
                        }) %>% do.call(rbind,.) %>% as.character()) %>% 
                        separate_rows(annotated_genes,dist_to_TSS,sep=",",convert=TRUE)  %>%
                        filter(abs(dist_to_TSS)< 1000)  %>% 
                        dplyr::select(contains("GSM"),annotated_genes)  %>% 
                        group_by(annotated_genes) %>% summarise_each(funs(mean))
        
    # show the paired methylation & expression one by one
    SME <- pbapply::pblapply(mid,function(m) {
            # mean vs median
              if(1){
                ExM <- function(z) mean(z, na.rm = TRUE)
              }else{
                ExM <- function(z) median(z, na.rm = TRUE)
              }
            # cat("Processing",m,"\n")
            # Methylation
            data_Me <-  data_methyl  %>% filter(variable==m) %>% pull(MM) %>% as.data.frame() 
            data_MHB <-  scMethyl_MHB.GR %>% dplyr::select(c(m,"annotated_genes")) %>% 
                          rename_with(~c("value","SYMBOL")) %>% mutate(type=ifelse(value>=0.8,"high",
                                                        ifelse(value<=0.2,"low","intermediate"))) %>%
                          group_by(type) %>% summarise(genes=paste0(SYMBOL, collapse=",")) %>% na.omit()
            # Exp
            ## test the RNA data  with paired Exp id
                eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID) 
                if(sum(str_detect(names(RNA.TPM),eid))==1){
                    RNA <- RNA.TPM
                #    cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
                    flag<-"TPM"
                }else{
                    RNA <- RNA.FPKM
                #    cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
                    flag <-"FPKM"
                }
                # two groups
                # 1. promoter intermediate methylation 
                # MHB high  methylation 
                MHB_h <- data_MHB %>% filter(type=="high") %>% pull(genes) %>% str_split_1(pattern=",") 
                # MHB low  methylation
                MHB_l <- data_MHB %>% filter(type=="low") %>% pull(genes) %>% str_split_1(pattern=",") 

                data_Me1 <- data_Me %>% mutate(MHB_Methyl = ifelse(SYMBOL %in% MHB_h ,"High",
                                                ifelse(SYMBOL %in% MHB_l,"Low","Other")))  %>% 
                                        filter(MHB_Methyl!="Other",type!="high")
                
                IE <- RNA %>%  dplyr::select(contains(eid)) %>% rownames_to_column(var="SYMBOL") %>% 
                         mutate(EID = eid,flag=flag) %>% inner_join(data_Me1,by="SYMBOL") %>% 
                         dplyr::rename_with(~c("SYMBOL","Exp","EID","flag","MHB","TSS_MM","TSS_type","MHB_type")) %>% 
                         group_by(TSS_type,EID,MHB_type,flag) %>% 
                         summarise(Exp=ExM(Exp),ncount=length(SYMBOL)) 
    })
     
          names(SME) <- mid
          # cancer type : promoter methyltion + MHB + Expression 
           CME <- do.call(rbind,SME) %>% as.data.frame() %>%
                  dplyr::rename_with(~c("TSS_type","EID","MHB","flag","Exp","ncount"))
                    
})
names(scMme) <- tag 
# sc_MMET: Tumor Methylation + MHB + Exp
GSE97693_scME_MHBGene <- scMme 

# Step3: PLOT
n=0
for ( x in tag ){

    # cat(x,"\n")
    n = n + 1
    if (!is.null(GSE97693_scME_MHBGene[[x]])){   ###check is null

      data <- GSE97693_scME_MHBGene[[x]] %>% as.data.frame() %>% mutate(Exp=log2(Exp+1))
      ## plot
        compare_means(Exp ~ MHB, data = data, group.by = c("TSS_type","flag"))
        assign(paste0("p",n),ggboxplot(data, x = "TSS_type", y = "Exp",color="MHB",
                        palette ="nejm",width=0.8,
                        bxp.errorbar = T,
                        add = "jitter",
                        add.params = list(size = 0.1),
                        facet.by = c("flag"), short.panel.labs = FALSE
                        ) +
                        xlab("Promoter Methylation-level")+ 
                        ylab("Expression")+
                        ggtitle(ifelse(x=="Tumor","Tumor",x))+ 
                        stat_compare_means(aes(group = MHB),size=2) +
                        theme_bw() +    
                        theme(panel.grid.major=element_line(colour=NA),
                        panel.background = element_rect(fill = "transparent",colour = NA),
                        plot.background = element_rect(fill = "transparent",colour = NA),
                        panel.grid.minor = element_blank(),
                        axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
                        plot.title = element_text(hjust = 0.5, size = 18),
                        panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
             )
            # cat(n,"\n")
    }
}

# paste to one figure 
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)

Figure 4I

# Step1: Single-cell Expression
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")
# dplyr::select the single cell with TPM data. (only with PT, LN, and Normal cell)
    if(0){
        RNA <- RNA.FPKM
       # cat("RNA FPKM ","\n")
    }else{
        RNA <- RNA.TPM
       # cat("RNA TPM ","\n")
    }
names(RNA) <- str_split_i(names(RNA),"_",i=1)
# load meta data
id <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_Anno.txt.gz",header=T) %>% 
      dplyr::select(ID,patients,lesions,assay,tissue,tag) %>% filter(tissue!="HeLa") 
id.RNA <- id %>% filter(ID  %in% names(RNA)) 
# Find DEG Markers
findDEG <- function(x,Idents) {
    # annotation
    treat <- id.RNA %>% dplyr::filter(str_detect(tissue,Idents)) %>% pull(ID)
    nc <- id.RNA %>% dplyr::filter(tissue=="NC") %>% pull(ID)
    # wilcoxon test
    z <- function(z) {
    d1 <- z[treat]%>% as.character() %>% as.numeric()
    d2 <- z[nc] %>% as.character() %>% as.numeric()

    p <-  wilcox.test(d1,d2)$p.value
    group1 <- mean(d1)
    group2 <- mean(d2)
    logFC <- log2(group1+1) - log2(group2+1)  
    
    c(group1,group2,logFC,p)
    }
    
    res <- t(apply(x,1,z)) %>% as.data.frame() %>% rename_with(~c("Treat","NC","logFC","p_value")) %>% 
            mutate(p_val_adj=p.adjust(p_value,method="fdr",length(p_value)),
                   DEG = ifelse(abs(logFC)>=1 & p_val_adj<=0.05,"Yes","No"))
    return(res)
}

PT.markers <- findDEG(RNA,Idents="PT") %>% rownames_to_column(var="SYMBOL")
LN.markers <- findDEG(RNA,Idents="LN") %>% rownames_to_column(var="SYMBOL")

# Step2: promoter DNA methylation
scMethyl_Mcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_methylated.txt.gz",
                            header=T) %>% distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/promoter_total.txt.gz",
                            header=T) %>% distinct() %>% column_to_rownames(var="region")
# filter promoters with Total reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% 
                            separate(region,into=c("chr","start","end"),sep="[:-]") %>% 
                            mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
                            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# promoter overlap with MHB 
tag=c("PT","LN")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    
    path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID paired data
    sid <- id %>% dplyr::filter(str_detect(tissue,x),assay=="Met") %>% pull(ID)
    nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)

    ids <- c(sid,nc)
    # dplyr::select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            dplyr::select(SYMBOL,all_of(ids)) %>%
                            cbind(a) 
})
names(scMHB_Methyl) <-tag

# find Promoter DMR markers
# Remove NA
sum_of_na <- function(x){
  sum(is.na(x))
}
target_genes <- function(x) {
      Mm <- apply(x,1,sum_of_na) > 0.3*ncol(x)  # promoter:0.3, MHB:0.9
      genes <- x$SYMBOL[Mm!="TRUE"]
      return(genes)
}

# DMG function
findDMGenes <- function(dataset,Idents){
    # annotation
    treat <- id %>% dplyr::filter(str_detect(tissue,Idents),assay=="Met") %>% pull(ID)
    nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)
    # dataset 1 & dataset 2 merging
    dataset1 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,MHB,all_of(treat)) %>% 
                                    filter(SYMBOL %in% target_genes(.)) 
    dataset2 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,all_of(nc)) %>% 
                                    filter(SYMBOL %in% target_genes(.))
    data <-  inner_join(dataset1,dataset2,by="SYMBOL") 
    # find DMG
    dmg <- function(x){

        d1 <- x[treat]%>% as.character() %>% as.numeric()
        d2 <- x[nc] %>% as.character() %>% as.numeric()

        p <-  wilcox.test(d1,d2)$p.value
        delta <- mean(d1,na.rm=T) - mean(d2,na.rm=T)
        res <- c(delta,p)
        return(res)
    }

    res <-  apply(data,1,dmg) %>% t() %>% as.data.frame() %>% 
            dplyr::rename_with(~c("delta","P.val")) %>% mutate(fdr=p.adjust(P.val,n=length(P.val))) %>% 
            cbind(data) %>% mutate(DMG = ifelse(abs(delta)>0.1 & fdr <= 0.05, "Yes","No")) %>%
            dplyr::select(SYMBOL,delta,P.val,fdr,DMG,MHB)
    # Return
    return(res)
}
DMG.PT<- findDMGenes(scMHB_Methyl,Idents="PT")
DMG.LN<- findDMGenes(scMHB_Methyl,Idents="LN")
# Identify DEGs + NonDMR + MHB Genes 
            # PT
            NonDMG.PT <-  DMG.PT  %>% filter(DMG=="No") %>% dplyr::select(SYMBOL,MHB)
            DEG.PT <-  PT.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>%
                        dplyr::select(SYMBOL,DE)
            data.PT<- inner_join(DEG.PT,NonDMG.PT,by="SYMBOL")
            # LN            
            NonDMG.LN <-  DMG.LN %>% filter(DMG=="No")  %>% dplyr::select(SYMBOL,MHB)
            DEG.LN <-  LN.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>%
                        dplyr::select(SYMBOL,DE)
            data.LN <- inner_join(DEG.LN,NonDMG.LN,by="SYMBOL")
            
            PT.MHB.NonDMR.Up <- data.PT %>% filter(DE=="UP",MHB=="T")  %>% pull(SYMBOL)
            LN.MHB.NonDMR.Up <-  data.LN %>% filter(DE=="UP",MHB=="T")  %>% pull(SYMBOL)

            geneset <-  intersect(PT.MHB.NonDMR.Up,LN.MHB.NonDMR.Up)
            test <- list("geneset"=geneset)
# saving 
write_gmt <- function(x,filename) {
    
    output<-file(filename,open = "wt")
    name <- names(x)
    lapply(name,function(y){
     writeLines(paste(c(y,"NA",x[[y]]),collapse = "\t"),con = output)
    
     })
    close(output)
}

write_gmt(test,filename="PT-LN.MHB.NonDMR.Up.gmt")
fwrite(as.data.frame(geneset),file="PT-LN.MHB.NonDMR.Up_geneset.txt",sep="\t",quote=F,row.names=F,col.names=F) 


# Step3: Geneset enrichment 
# Result From Msigdb website
msig <- data.frame(ID=c("HALLMARK_MYC_TARGETS_V1","HALLMARK_UNFOLDED_PROTEIN_RESPONSE",
                        "HALLMARK_G2M_CHECKPOINT"),qvalue = c(5.21e-8,2.33e-6,5.67e-4))
# PLOT
p <- ggplot(msig,aes(x=fct_rev(fct_inorder(ID)),y= -log10(qvalue))) +
     geom_bar(stat="identity",fill="steelblue") + ylim(0,10) +
     labs(x="",y="-Log10(FDR)",title="Enrichment of hallmark pathways") +
     coord_flip() + 
     theme_bw()+
     theme(panel.grid=element_blank(),
            panel.background =element_blank(),
            plot.title = element_text(hjust=0.5),
            axis.text = element_text(color="black"))
print(p)

Figure 4J

# Step 5: show the 42 common genes are shared by PT and LN.
# RNA merging with DNA methylation matrix
RNA.DEG.PT <- PT.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset)  %>% 
                inner_join(DMG.PT,by="SYMBOL") %>% dplyr::rename("PT.FC"="logFC","PT.delta"="delta") %>%  
                dplyr::select(SYMBOL,PT.FC,PT.delta)
RNA.DEG.LN <- LN.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset)  %>%
                inner_join(DMG.LN,by="SYMBOL") %>% dplyr::rename("LN.FC"="logFC","LN.delta"="delta") %>% 
                dplyr::select(SYMBOL,LN.FC,LN.delta)
RNA.DEG <- RNA.DEG.PT %>% inner_join(RNA.DEG.LN,by="SYMBOL")

log2.RNA <- log2(RNA+1) %>% rownames_to_column(var="SYMBOL")
Methyl <- scMethyl %>% dplyr::select(-c(1:4))
data <-  RNA.DEG %>% inner_join(log2.RNA,by="SYMBOL") %>% 
            inner_join(Methyl,by="SYMBOL")
# RNA flag
id.RNA.NC <- id.RNA %>% filter(tissue=="NC") %>% pull(ID)
id.RNA.PT <- id.RNA %>% filter(tissue=="PT") %>% pull(ID)
id.RNA.LN <- id.RNA %>% filter(tissue=="LN") %>% pull(ID)
id.RNA.tag <- c(id.RNA.NC,id.RNA.PT,id.RNA.LN)
# Methylation flag
id.Met.NC <- id %>% filter(tissue=="NC",assay=="Met") %>% pull(ID)
id.Met.PT <- id %>% filter(tissue=="PT",assay=="Met") %>% pull(ID)
id.Met.LN <- id %>% filter(tissue=="LN",assay=="Met") %>% pull(ID)
id.Met.ML <- id %>% filter(tissue=="ML",assay=="Met") %>% pull(ID)
id.Met.MP <- id %>% filter(tissue=="MP",assay=="Met") %>% pull(ID)
id.Met.tag <- c(id.Met.NC,id.Met.PT,id.Met.LN,id.Met.ML,id.Met.MP)

# In tissue-level
mhb_N_Tx <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_NC.bed.gz")  %>% 
            toGRanges() %>% 
            findOverlaps(scMethyl.GR,ignore.strand=T,type= "any")
mhb_N_genes <- mcols(scMethyl.GR[subjectHits(mhb_N_Tx)]) %>% as.data.frame() %>%  pull(SYMBOL) %>% unique()
## RNA
RNA.mean <- lapply(c("PT","LN","NC"),function(x){
     ID <-  id.RNA %>% filter(tissue==x) %>% pull(ID)
     RNA %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% as.data.frame() %>% rename_with(~x)
})
RNA.data <- do.call(cbind,RNA.mean) %>% rownames_to_column(var="SYMBOL") %>% 
            filter(SYMBOL %in% geneset) %>% column_to_rownames(var="SYMBOL")
## Methylation
Met.mean <- lapply(c("PT","LN","ML","MP","NC"),function(x){
     ID <-  id %>% filter(tissue==x,assay=="Met") %>% pull(ID)
     scMethyl %>% dplyr::select(-c(1:4))  %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% 
                as.data.frame() %>% rename_with(~x)
})
Met.data <- do.call(cbind,Met.mean) %>% as.data.frame() %>% cbind(scMethyl[,5]) %>% 
            filter(SYMBOL %in% geneset) %>% arrange(SYMBOL) %>% column_to_rownames(var="SYMBOL")
# PLOT 1 
## Methylation
column_ha_1 = HeatmapAnnotation(Assay = rep("WGBS",5) , Type = names(Met.data))
row_ha_1 = rowAnnotation(MHB_PT = rep(1,42),MHB_LN = rep(1,42), MHB_NC= sapply(rownames(Met.data),
                                                    function(x) {ifelse(x %in% mhb_N_genes,1,0)}))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Assay = rep("RNA",3) , Type = names(RNA.data) )
colfun21 <- colorRamp2(breaks = c(-4, 0, 4), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(RNA.data))))

p1 <- Heatmap(as.matrix(Met.data),name="Methylation",
                col=colfun11,cluster_columns= F,
                cluster_rows=F, left_annotation = row_ha_1,
                top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
                col=colfun21,cluster_columns= F,
                 cluster_rows=F,
                top_annotation = column_ha_2)
p<- p1+p2
print(p)

Figure 4K

# TCGA COAD validation MYC target & G2M  genes
# load DNAme data
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Methyl/TCGA_Promoter_Methylation_UD1000.RData")
MM_COAD <- cbind(mcols(Mregion)["name"],rList$COAD)

# load expression data
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig5/TCGA_Exp/Firehose_Expression.RData")
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig5/TCGA_Exp/Firehose_SampleSheet.RData")
Exp_COAD <- Firehose_Expression$COAD
# clinical data 
COAD_clinical <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig5/TCGA_Exp/COAD.merged_only_clinical_clin_format.txt.gz") %>% 
                filter(V1 %in% c("patient.bcr_patient_barcode","patient.stage_event.pathologic_stage","patient.vital_status")) %>%
                column_to_rownames(var="V1") %>% t() %>% as_tibble() %>% 
                rename_with(~c("ID","Stage","status")) %>% 
                mutate(ID = str_to_upper(ID),Stage=str_replace_all(Stage,"stage ","") %>% str_to_upper())
# Combine DNAme & Exp & clinical data
match.id.COAD <- intersect(names(MM_COAD)[-1] %>% str_sub(1,16) ,colnames(Exp_COAD) %>% str_sub(1,16))
match.id.COAD <- data.frame("Tag" =match.id.COAD) %>% mutate(ID = str_sub(Tag,1,12)) %>% 
                left_join(COAD_clinical,by="ID") %>% 
                mutate(Stage=ifelse(str_detect(Stage,"III"),"III",
                              ifelse(str_detect(Stage,"II"),"II",
                              ifelse(str_detect(Stage,"IV"),"IV",
                              ifelse(str_detect(Stage,"I"),"I",NA)))),
                        Stage = ifelse(str_detect(Tag,"-11"),"Normal",Stage)) %>%
                        na.omit()
# signature selection
signature <- c("CBX3","DDX21","KARS","KIF5B","NOLC1","PSMA7","PSMB3","RANBP1","SLC12A2","TOP1")
signature.me <- lapply(c("I","II","III","IV","Normal"),function(x){
            tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag) %>% str_replace_all("-",".")
            MM_COAD %>% as_tibble() %>% dplyr::select("name",contains(tag)) %>% 
            filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name") %>% 
            rowMeans() %>% as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.me <- do.call(rbind,signature.me) %>%as.data.frame() %>% 
                    pivot_wider(values_from="value",names_from="Tag") %>% 
                    column_to_rownames(var="gene")
signature.exp <- lapply(c("I","II","III","IV","Normal"),function(x){
           tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag) 
           ## to TPM 
           temp <- Exp_COAD %>% as.data.frame() %>% dplyr::select(contains(tag)) %>% 
                    rownames_to_column(var="name") %>% 
                    filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name") 
           tpm  <- 2^(temp)-1 
           tpm  %>%  rowMeans() %>%  as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.exp <- do.call(rbind,signature.exp) %>%as.data.frame() %>% 
                 pivot_wider(values_from="value",names_from="Tag") %>% 
                 column_to_rownames(var="gene")
# PLOT 2
## Methylation
column_ha_1 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.me)))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.exp) ))
colfun21 <- colorRamp2(breaks = c(-2, 0, 2), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(signature.exp))))

p1 <- Heatmap(as.matrix(signature.me),name="Methylation",
                col=colfun11,cluster_columns= F,
                cluster_rows=F, 
                top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
                col=colfun21,cluster_columns= F,
                 cluster_rows=F,
                top_annotation = column_ha_2)
p<- p1+p2
print(p)