Figure6. Characterization of DNA methylation inter-tumoral heterogeneity in cancers. (A) Boxplots showing Pearson correlation coefficients of average methylation between pairwise CpGs in TCGA 450K array data. Regions analyzed: pan-cancer MHBs (all cancer types), cancer-type-specific MHBs, CGIs, and random genomic regions.(B) Quantification of concordant methylation levels. Using COAD as an example, LOESS curves were fitted between mean methylation and variance for MHBs versus random regions. The AUC quantifies concordance (lower AUC = higher concordance).(C) Number of DEGs between tumors with lowest/ highest AUCs. Red: upregulated; blue: downregulated. (D) Enriched pathways in upregulated DEGs, hallmark pathways from MSigDB. (E) Volcano plot associating driver mutations with concordant methylation levels (measured by AUC). Statistical significance was calculated by student’s two-sided t-test (FDR < 0.05). Red: AUC increased; blue: AUC decreased. (F) Boxplots showing IDH1 mutation significantly associated with methylation concordance in TCGA dataset.(G) Independent validation of IDH1 mutation effect (GSE50774). Mean methylation-variance plots compare IDH1-mutant (n = 13) and wild-type (n = 13) samples.
loading required packages:
# 1. Load data
# Calculate the methylation correlation of HM450K probes from Random, CGI, and MHB regions
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]
sourceFolder = "/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/TCGA_DNAm"
# CpG correlation
ArrayIMCorrelation <-function(MM,IntervalGR,cpgGR,Tag){
# check seqlevels
if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
stop("No shared seqlevels found.\n")
# only keep CpGs that locate in intervals
Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
Idx2 = names(cpgGR) %in% rownames(MM)
Idx = Idx1 & Idx2
if(sum(Idx) == 0)
stop("No CpGs found in pre-defined regions.\n")
cpgGR = cpgGR[Idx, ]
# keep shared CpGs
cNames = intersect(rownames(MM), names(cpgGR))
cpgGR = cpgGR[cNames]
MM = MM[cNames, ]
# Only select the tumor samples
MM = MM[,!grepl("11",substr(colnames(MM),14,15))]
# Sample-by-sample processing to control memory
mergedM = matrix(NA, length(IntervalGR), 1)
x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
cM = list()
for (i in unique(subjectHits(x)) ){
a = which(subjectHits(x) == i)
tag = paste0("S",i)
if (length(a)<2){
cM[[tag]] = NA
}else if (nrow(na.omit(MM[a,]))<2) {
cM[[tag]] = NA
} else {
b = as.vector(cor(t(na.omit(MM[a,]))))
cM[[tag]] = median(b[b!=1])
}
}
aT = do.call(rbind,cM)
aID = as.character(gsub("S","",rownames(aT)))
aMM = as.matrix(aT[,1])
mergedM[as.integer(aID), ] = aMM
colnames(mergedM) = Tag
## return
return(mergedM)
}
###############################################
# 1. random region : create by bedtools "shuffle" keep cpg density and region length compare with cancer MHB
###############################################
Random_regions = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/regions/MHB_Callable_random.bed")
vFiles = list.files(sourceFolder)
Ns = length(vFiles)
Random_List = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
# cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
Random_List[[i]] = ArrayIMCorrelation(MM, Random_regions, RnBeads_450K_hg19_GR,Tag)
}
names(Random_List) = gsub(".RData", "", vFiles)
Random_List = do.call(cbind,Random_List)
mcols(Random_regions) = Random_List
save(Random_List,Random_regions,file="TCGA_Random_ArrayCorrelation.RData")
##################################################
# 2. CGI regions
#################################################
CGI = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig6/regions/hg19_cpgIsland.bed")
vFiles = list.files(sourceFolder)
Ns = length(vFiles)
CGI_List = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
# cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
CGI_List[[i]] = ArrayIMCorrelation(MM, CGI, RnBeads_450K_hg19_GR,Tag)
}
names(CGI_List) = gsub(".RData", "", vFiles)
CGI_List = do.call(cbind,CGI_List)
mcols(CGI) = CGI_List
save(CGI_List,CGI,file="TCGA_CGI_ArrayCorrelation.RData")
####################################################
## 3. Cancer-specific MHBs regions
###################################################
path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor"
files = list.files(path)
cTList=list()
for (cT in files ){
# Read Cancer MHB
Cancer_MHB = toGRanges(paste0(path,"/",cT))
Ns = length(vFiles)
cList = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
# cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
cList[[i]] = ArrayIMCorrelation(MM, Cancer_MHB, RnBeads_450K_hg19_GR,Tag)
}
names(cList) = gsub(".RData", "", vFiles)
mcols(Cancer_MHB)= do.call(cbind,cList)
cTList[[cT]] = Cancer_MHB
}
names(cTList) = files %>% str_split_i("[_.]",i=2)
## save
save(cTList,file="TCGA_Cancer_specific_MHB_ArrayCorrelation.RData")
##################################################
# 4. 81567 MHB region
#################################################
tMHB = toGRanges("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz")
vFiles = list.files(sourceFolder)
Ns = length(vFiles)
tMHB_List = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
# cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
tMHB_List[[i]] = ArrayIMCorrelation(MM, tMHB, RnBeads_450K_hg19_GR,Tag)
}
names(tMHB_List) = gsub(".RData", "", vFiles)
tMHB_List = do.call(cbind,tMHB_List)
mcols(tMHB) = tMHB_List
save(tMHB_List,tMHB,file="TCGA_tMHB_ArrayCorrelation.RData")
###############################################
# 5. plot
################################################
vFiles=names(cTList)
data = list()
for (i in vFiles){
if ( i=="BRCA" ) {
tag = i
}else if(i=="CESC"){
tag = i
}else if(i=="COAD") {
tag = c("COAD","COADREAD","READ")
}else if(i=="ESCA"){
tag = c("ESCA","STES")
}else if(i=="HNSC"){
tag = "HNSC"
}else if( i=="LIHC"){
tag = "LIHC"
}else if(i=="NSCLC"){
tag = c("LUAD","LUSC")
}else if ( i=="OV"){
tag = "OV"
}else if (i=="PACA"){
tag = "PAAD"
}else if(i=="STAD"){
tag = c("STAD","STES")
}else if( i=="THCA") {
tag = "THCA"
}
for (j in tag){
## Cancer-specific MHB
x = na.omit(mcols(cTList[[i]])[j])
x$Group = "MHB"
x$Type = j
colnames(x) <- c("res","Group","Type")
## union cancer-type MHB
xx = na.omit(mcols(tMHB)[j])
xx$Group = "tMHB"
xx$Type = j
colnames(xx) <- c("res","Group","Type")
## CGI
y= na.omit(mcols(CGI)[j])
y$Group = "CGI"
y$Type = j
colnames(y) <- c("res","Group","Type")
## ramdom
z = na.omit(mcols(Random_regions)[j])
z$Group = "Random"
z$Type = j
colnames(z) <- c("res","Group","Type")
mat = as.data.frame(rbind(x,xx,y,z))
data[[j]] = mat
}
}
res_data <- do.call(rbind,data)
res_data$Group = factor(res_data$Group,levels = c("MHB","tMHB","CGI","Random"))
## boxplot
p<-ggplot(res_data, aes(x=Type, y=res, fill= Group )) +
geom_boxplot(outlier.shape = NA,position = position_dodge(width = 0.8)) +
labs(x= "",y="Pearson's Correlation Coefficience (r)") +
scale_fill_brewer(palette = "Set1")+
theme_bw() +
theme(panel.background = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.line=element_line(color="black"),
axis.text.x = element_text(color="black",size=10,angle = 60,hjust = 1),
axis.text.y = element_text(color="black",size=10),
axis.ticks = element_line(color="black"),
axis.ticks.length = unit(5, "pt"),
legend.position="bottom")
print(p)
## random
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_Methylation_random_0421.RData")
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_MethylVariations_random_0421.RData")
rList_random <- MM
metVar_random <- metVar
## COAD MHB
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_Methylation_0421.RData")
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_MHB_Call_able_MethylVariations_0421.RData")
rList <- MM
metVar <- metVar
MHB = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Call_able.cpg.bed") %>%
as.character()
MHB_random = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Callable_random.bed") %>%
as.character()
## loess fitted
loessCurve <- function(dataFrame, xValue) {
loessDf <- loess(var ~ mean, data = dataFrame, span =0.7,se = FALSE) ###Random :span = 1
yloessDfVals <- c(0,loessDf$fitted,0) ### var
#yloessDfVals[yloessDfVals < 0] <- 0
xloessDfVals <- c(0,loessDf$x,1) ### mean
loessDfAUC <- round(pracma::trapz(xloessDfVals, yloessDfVals), 3)
yValue <- mean(yloessDfVals[which(round(xloessDfVals, 2) == xValue)])
list(xloessDfVals = xloessDfVals, yloessDfVals = yloessDfVals, loessDfAUC = loessDfAUC, yValue = yValue)
}
######################################################
## test demo MCB vs. Random
######################################################
tumor = "COAD"
## mcb
test_methy <- rList[[tumor]] %>% as.data.frame() %>% mutate(name=MHB) %>%
pivot_longer(-c(name),names_to = "sample",values_to = "mean")
test_var <- metVar[[tumor]] %>% as.data.frame() %>% mutate(name=MHB) %>%
pivot_longer(-c(name),names_to = "sample",values_to = "var")
test_mcb <- test_methy %>% inner_join(test_var,by=c("name","sample")) %>% na.omit()
set1 <- data.frame(MM=test_mcb$mean,var=test_mcb$var,type="mcb") %>% mutate(group = cut_interval(MM,length = 0.005)) %>%
group_by(group,type) %>% summarise(mean = mean(MM), var = mean(var)) %>%
ungroup()
mcb_demo <- loessCurve(set1, 0.5)
set11 <- data.frame(MM=mcb_demo$xloessDfVals,var=mcb_demo$yloessDfVals,type="mcb")
## random
test_methy_random <- rList_random[[tumor]] %>% as.data.frame() %>% mutate(name=MHB_random) %>%
pivot_longer(-c(name),names_to = "sample",values_to = "mean")
test_var_random <- metVar_random[[tumor]]%>% as.data.frame() %>% mutate(name=MHB_random) %>%
pivot_longer(-c(name),names_to = "sample",values_to = "var")
test_random <- test_methy_random %>% inner_join(test_var_random,by=c("name","sample")) %>% na.omit()
set2 <- data.frame(MM=test_random$mean,var=test_random$var,type="random") %>% mutate(group = cut_interval(MM,length = 0.005)) %>%
group_by(group,type) %>% summarise(mean = mean(MM), var = mean(var)) %>%
ungroup()
random_demo <-loessCurve(set2, 0.5)
set22 <- data.frame(MM=random_demo$xloessDfVals,var=random_demo$yloessDfVals,type="random")
## merged and plot
dataset <- rbind(set11,set22) %>% rename_with(~c("MM","var","type"))
p<-ggplot(dataset, aes(x=MM, y=var, color=type)) +
geom_line(aes(color=type)) +
scale_color_manual(values = c("mcb" = "firebrick", "random" = "steelblue")) +
geom_area(aes(fill=type),alpha = 0.3,position="identity") +
scale_fill_manual(values = c("mcb" = "firebrick", "random" = "steelblue")) +
ggtitle("TCGA-COAD", subtitle = "Methylation & Variation") +
labs(x = "Methylation", y = "Variation") +
scale_y_continuous(expand=c(0,0),limits = c(0,0.08)) +
annotate("text",x=0.45,y=0.045,label=paste0("MCB AUC = ",round(mcb_demo$loessDfAUC,3)),size=3,color="firebrick") +
annotate("text",x=0.45,y=0.04,label=paste0("Random AUC = ",round(random_demo$loessDfAUC,3)),size=3,color="steelblue") +
geom_segment(x = 0.5, xend = 0.5, y = 0, yend = random_demo$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_segment(x = 0, xend = 0.5, y = random_demo$yValue, yend = random_demo$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_segment(x = 0, xend = 0.5, y = mcb_demo$yValue, yend = mcb_demo$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_text(label = round(mcb_demo$yValue,3), x = 0, y = mcb_demo$yValue,
vjust = 1.5, color = "firebrick", size = 3) +
geom_text(label = round(random_demo$yValue,3), x = 0, y = random_demo$yValue,
vjust = 1.5, color = "steelblue", size = 3) +
theme_bw() +
theme(panel.background = element_blank(),
panel.border = element_rect(color = "black", linewidth = 0.8,linetype = "solid"),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.ticks = element_line(linewidth = 0.5,color="black"),
axis.line = element_blank())
print(p)
## 1. MHB AUC score (Sample-level)
tag <- names(MM)
AUC_MHB_score <- pbapply::pblapply(tag, function(x) {
cat(x, "\n")
# MHB methylation data
methyl <- MM[[x]] %>% as.data.frame()
# MHB variation data
var <- metVar[[x]] %>% as.data.frame()
# ROC analysis
s1 <- names(methyl)
AUC_Scores <- lapply(s1,function(z){
temp <- data.frame("mean" = methyl[[z]],"var"=var[[z]]) %>%
na.omit() %>% arrange(mean)
fit <- loessCurve(temp, 0.5)
return(fit$loessDfAUC)
})
# rename
names(AUC_Scores) <- s1
score <- do.call(rbind, AUC_Scores) %>% as.data.frame() %>%
dplyr::rename(AUC = V1) %>% mutate(Type = x) %>%
rownames_to_column(var = "ID") %>%
filter(!str_detect( ID %>% str_split_i("-",i=4) ,"11")) %>%
mutate(cluster = ifelse(AUC > median(AUC), "High", "Low"))
return(score)
})
ACC
BLCA
BRCA
CESC
CHOL
COAD
COADREAD
DLBC
ESCA
GBM
GBMLGG
HNSC
KICH
KIPAN
KIRC
KIRP
LAML
LGG
LIHC
LUAD
LUSC
MESO
OV
PAAD
PCPG
PRAD
READ
SARC
SKCM
STAD
STES
TGCT
THCA
THYM
UCEC
UCS
UVM
names(AUC_MHB_score) <- tag
## AUC scores
xx = do.call(rbind,AUC_MHB_score) %>% as.data.frame() %>% arrange(Type, desc(AUC))
## remove normal samples
xx <- xx %>% mutate(cancer = ID %>% str_split_i("-",i=4),
TT = ifelse(str_detect(cancer,"11"),"Normal","Tumor")) %>%
filter(TT != "Normal")
## TPM expression
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/Firehose_Expression_Tumor_Primary.RData")
## AUC MHB variation
AUC_MHB_score <- xx %>% mutate(sample = str_sub(ID,1,12))
cancer_type <- names(Firehose_Expression_Tumor)
## 2. AUC low vs high
TCGA_AUC_DEG <- pbapply::pblapply(cancer_type,function(i){
## groupping
groups= AUC_MHB_score %>% filter(Type == i, str_detect(ID %>% str_sub(13,15), "-01|-03"))
## common samples (DNAm & TPM)
shared_samples <- intersect(colnames(Firehose_Expression_Tumor[[i]]),groups$sample) %>% unique()
groups <- groups %>% filter(sample %in% shared_samples) %>%
mutate(cluster = ifelse(AUC > median(AUC), "High", "Low"),
cluster=fct_inorder(cluster))
## data matrix log2(TPM+1)
expset <- Firehose_Expression_Tumor[[i]]
expset <- expset[which(rowMeans(expset)>=1),]
expset <- expset[,groups$sample]
## pheno data
Group <- groups$cluster
design<-model.matrix(~Group)
## DEG limma fit
fit<-lmFit(expset,design)
fit1<-eBayes(fit)
options(digits = 4)
all_diff<-topTable(fit1,coef=2,adjust='BH',number = 30000,sort.by = 'logFC')
all_diff$DEG <- ifelse(all_diff$logFC >=1 & all_diff$adj.P.Val<= 0.05, "Up",
ifelse(all_diff$logFC <= -1 & all_diff$adj.P.Val <=0.05, "Down","NC"))
tT = subset(all_diff,select=c("P.Value","adj.P.Val","logFC","DEG"))
colnames(tT)=c("P_Value","FDR","logFC","DEG")
tT <- tT[rownames(expset),]
matrix <- expset %>% as.data.frame() %>% cbind(tT)
return(matrix)
})
names(TCGA_AUC_DEG) <- cancer_type
## The number of DEGs
counts<- lapply(cancer_type,function(x){ TCGA_AUC_DEG[[x]] %>%
filter(DEG !="NC") %>% count(DEG) %>% mutate(type=x)})
counts <- do.call(rbind,counts)
## PLOT
p<-ggplot(counts,aes(x=fct_rev(type),y=n,fill=fct_rev(DEG))) +
geom_bar(stat = "identity",position = position_dodge(width = 0.8),linewidth=0.8) +
labs(x="",y="Number of genes") +
scale_fill_brewer(palette = "Set1") +
theme_bw() +
theme(panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.x =element_text(angle = 90,hjust = 1)) +
coord_flip()
print(p)
## 1. AUC score low vs high (DEGs)
gene_list<-lapply(TCGA_AUC_DEG,function(x){
genes <- x %>% dplyr::filter(DEG != "NC") %>%
dplyr::select("FDR","logFC","DEG") %>%
rownames_to_column(var="genes")
})
names(gene_list) <- cancer_type
pan_DEGs <- do.call(rbind,gene_list) %>% mutate(Type = rownames(.) %>% str_split_i("\\.",1))
## 2. Hallmark enrichment
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>%
dplyr::select(gs_name, entrez_gene)
## convert : the existence of genes
pan_genes <- pan_DEGs %>% group_by(Type) %>% group_split()
hallmarks_enrichment <- lapply(pan_genes,function(x){
up_genes <- x %>% filter(DEG=="Up") %>% pull(genes) %>% unique()
down_genes <- x %>% filter(DEG=="Down") %>% pull(genes) %>% unique()
## up
up_genes_entrez = bitr(up_genes,
fromType="SYMBOL",
toType="ENTREZID",
OrgDb="org.Hs.eg.db")
## down
down_genes_entrez = bitr(down_genes,
fromType="SYMBOL",
toType="ENTREZID",
OrgDb="org.Hs.eg.db")
em_up <- enricher(up_genes_entrez$ENTREZID, TERM2GENE=m_t2g,pAdjustMethod= "BH") %>%
as.data.frame() %>% mutate(DEG = "Up")
em_down <- enricher(down_genes_entrez$ENTREZID, TERM2GENE=m_t2g,pAdjustMethod= "BH") %>%
as.data.frame() %>% mutate(DEG = "Down")
em <- rbind(em_up,em_down) %>% filter(p.adjust < 0.05)
return(em)
})
names(hallmarks_enrichment) <- pan_DEGs %>% pull(Type) %>% unique()
## dotplot
df <- do.call(rbind,hallmarks_enrichment) %>% filter(DEG == "Up") %>%
mutate(Type =rownames(.) %>% str_split_i("\\.",1)) %>%
dplyr::select(ID,GeneRatio,p.adjust,Count,Type) %>%
mutate(GeneRatio = (GeneRatio %>% str_split_i("/",1) %>% as.numeric())/(GeneRatio %>% str_split_i("/",2) %>% as.numeric()),
Count = as.numeric(Count)) %>% arrange(p.adjust)
colors <- colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
p<-ggplot(df, aes(x = Type, y = reorder(ID, GeneRatio))) +
geom_point(aes(size = GeneRatio, color = -log10(p.adjust))) +
#scale_color_gradient(low = "red", high = "blue") +
#scale_color_continuous(type = "") +
theme_bw() +
scale_color_gradientn(colors = colors) +
labs(x="",y="") +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(size=6))
print(p)
## 1. t.test (Mutation vs. WT )
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/TCGA_Gene_level_MutsigCV.RData")
<- names(Gene_level_MutsigCV)
tag
## 2. TCGA MHB Mut vs WT (Sample-level)
<- names(MM)
files <- pbapply::pblapply(file, function(x) {
AUC_sig_MCB ## load Sigificant Mutation genes
<- Gene_level_MutsigCV[[x]] %>% names()
gene_mutation
if (length(gene_mutation)!=0) {
<- pbapply::pblapply(gene_mutation, function(y) {
gene_AUC_Sig ## 1. Sample classification
## load sample_Mut
<- Gene_level_MutsigCV[[x]] %>%
sample_Mut_ID ::select(all_of(y)) %>%
dplyr::rename_with(~c("gene")) %>%
dplyr::filter( gene == 1) %>% rownames() %>%
dplyrstr_sub(1,16)
## load sample_WT
<-Gene_level_MutsigCV[[x]] %>%
sample_WT_ID ::select(all_of(y)) %>%
dplyr::rename_with(~c("gene")) %>%
dplyr::filter( gene == 0) %>% rownames() %>% str_sub(1,16)
dplyr#########################################
## 2. MHB Methylation data
# MHB methylation data
<- MM[[x]] %>% as.data.frame() %>%
methyl_WT ::select(contains(sample_WT_ID))
dplyr<- MM[[x]] %>% as.data.frame() %>%
methyl_Mut ::select(contains(sample_Mut_ID))
dplyr# MHB variation data
<- metVar[[x]] %>% as.data.frame() %>%
var_WT ::select(contains(sample_WT_ID))
dplyr<- metVar[[x]] %>% as.data.frame() %>%
var_Mut ::select(contains(sample_Mut_ID))
dplyr## 3. ROC analysis
# WT
<- names(methyl_WT)
s1 <- lapply(s1,function(z){
AUC_WT <- data.frame("mean" = methyl_WT[[z]],"var"=var_WT[[z]]) %>% na.omit() %>%
temp arrange(mean)
<- loessCurve(temp, 0.5)
fit return(fit$loessDfAUC)
})# Mut
<- names(methyl_Mut)
s2 <- lapply(s2,function(z){
AUC_Mut <- data.frame("mean" = methyl_Mut[[z]],"var"=var_Mut[[z]]) %>% na.omit() %>%
temp arrange(mean)
<- loessCurve(temp, 0.5)
fit return(fit$loessDfAUC)
})
## 4. test statistic
<- as.numeric(AUC_WT)
t1 <- as.numeric(AUC_Mut)
t2 <- ifelse(length(t1) <3 || length(t2) <3, NA,t.test(t1,t2) %>% broom::tidy() %>% dplyr::select(p.value) %>% as.numeric() )
p <- mean(t2) - mean(t1)
fc return(c(mean(t2),mean(t1),p,fc))
})names(gene_AUC_Sig) <- gene_mutation
<- do.call(rbind, gene_AUC_Sig) %>%
gene_significant as.data.frame() %>%
::rename(AUC_Mut = V1 ,AUC_WT = V2,p.value = V3, FC = V4) %>% mutate(FDR = p.adjust(p.value, method = "fdr", n = length(p.value)))
dplyrreturn(gene_significant)
else {
}NA
}
})names(AUC_sig_MCB) <- file
= do.call(rbind,AUC_sig_MCB) %>% as.data.frame() %>%
xx rownames_to_column(var = "gene") %>%
separate(gene, into = c("cancer", "gene"), sep = "\\.", remove = FALSE)
## save
write.table(xx, file = paste0("AUC_MHB_Callable/TCGA_Mutation_",file,"_AUC_MHB_ttest_sig.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
path <- "/sibcb1/bioinformatics/zhangzhiqiang/MHB_Cancer/Mut_Var/AUC_MHB_Callable"
files <- list.files(path = path,pattern = "MHB_ttest_sig.txt")
test.data <- lapply(files,function(x){
read.table(paste0(path,x),header = T,sep = "\t") %>%
na.omit() %>%
filter(p.value <= 0.05)})
res <- do.call(rbind,test.data) %>% as.data.frame() %>%
mutate(type = ifelse(FC > 0, "increased","decreased")) %>%
mutate(ID = str_c(cancer,"_",gene))
## volcano plot
res1 <- res %>% filter(abs(FC) > 0.0004,p.value <= 0.0001)
p<-ggplot(res,aes(x = 1000*FC,y = -log10(FDR))) +
geom_point(shape = 16,size = 1.5,aes(color = type)) +
xlim(c(-12,12)) +
scale_y_continuous(expand = c(0,0),limits = c(0,13)) +
scale_color_brewer(palette = "Set1",direction = -1) +
theme_classic() +
labs(x = "Fold Change (10-3)",y = "-log10(q Value)") +
theme(panel.background = element_blank(),panel.grid =element_blank(),
axis.ticks.length=unit(5,'pt'),
axis.text.x =element_text(angle=0,vjust=0.5),
plot.title=element_text(hjust=0.5)) +
ggrepel::geom_text_repel(data = res1,aes(x = 1000*FC,
y = -log10(FDR),label = ID),
color = "black",size = 2,
max.overlaps = 100000)
print(p)
# Show the MHB AUC of GSE50774 (GBMLGG)
## 1. load GSE50774 data
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]
mean0 = function(z) mean(z, na.rm = TRUE)
load("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/AUC/GSE50774.RData")
meta_info = meta %>% filter(str_detect(`idh1/h3f3a mutation status:ch1`,"IDH1")) %>%
mutate(mutation = ifelse(str_detect(`idh1/h3f3a mutation status:ch1`,"MUT"), "Mut", "WT"))
WT_sample_id <- meta_info %>% filter(mutation == "WT") %>% rownames()
Mut_sample_id <- meta_info %>% filter(mutation == "Mut") %>% rownames()
## MHB
GBM_MCB = toGRanges("/sibcb1/bioinformatics/zhangzhiqiang/CancerMHB/Fig6/regions/MHB_Call_able.cpg.bed")
## methylation
ArrayIntervalMethylation <- function(MM, IntervalGR, cpgGR){
# check seqlevels
if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
stop("No shared seqlevels found.\n")
# only keep CpGs that locate in intervals
Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
Idx2 = names(cpgGR) %in% rownames(MM)
Idx = Idx1 & Idx2
if(sum(Idx) == 0)
stop("No CpGs found in pre-defined regions.\n")
cpgGR = cpgGR[Idx, ]
# keep shared CpGs
cNames = intersect(rownames(MM), names(cpgGR))
cpgGR = cpgGR[cNames]
MM = MM[cNames, ]
# Sample-by-sample processing to control memory
mergedM = matrix(NA, length(IntervalGR), ncol(MM))
x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
MM.adj <- MM[queryHits(x),]
aT = aggregate(MM.adj, by = list(factor(subjectHits(x))), FUN="mean0")
aID = as.character(aT[,1])
aMM = as.matrix(aT[,-1])
mergedM[as.integer(aID), ] = aMM
colnames(mergedM) = colnames(MM)
## return
return(mergedM)
}
## variation
var0 <- function(x) ifelse(length(x)>=3, var(x,na.rm=T),NA)
ArrayIntervalMethylVar <- function(MM, IntervalGR, cpgGR){
# check seqlevels
if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
stop("No shared seqlevels found.\n")
# only keep CpGs that locate in intervals
Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
Idx2 = names(cpgGR) %in% rownames(MM)
Idx = Idx1 & Idx2
if(sum(Idx) == 0)
stop("No CpGs found in pre-defined regions.\n")
cpgGR = cpgGR[Idx, ]
# keep shared CpGs
cNames = intersect(rownames(MM), names(cpgGR))
cpgGR = cpgGR[cNames]
MM = MM[cNames, ]
# Sample-by-sample processing to control memory
mergedM = matrix(NA, length(IntervalGR), ncol(MM))
x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
MM.adj <- MM[queryHits(x),]
aT = aggregate(MM.adj, by = list(factor(subjectHits(x))), FUN="var0")
aID = as.character(aT[,1])
aMM = as.matrix(aT[,-1])
mergedM[as.integer(aID), ] = aMM
colnames(mergedM) = colnames(MM)
## return
return(mergedM)
}
## methylatin & variation
GBM_Methylation = ArrayIntervalMethylation(MeM, GBM_MCB, RnBeads_450K_hg19_GR)
GBM_MethylVar = ArrayIntervalMethylVar(MeM, GBM_MCB, RnBeads_450K_hg19_GR)
## 2. MHB AUC analysis
# MHB methylation data
methyl_WT <- GBM_Methylation %>% as.data.frame() %>% dplyr::select(all_of(WT_sample_id)) %>%
mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "mean")
methyl_Mut <- GBM_Methylation %>% as.data.frame() %>% dplyr::select(all_of(Mut_sample_id)) %>%
mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "mean")
# MHB variation data
var_WT <- GBM_MethylVar %>% as.data.frame() %>% dplyr::select(all_of(WT_sample_id)) %>%
mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "var")
var_Mut <- GBM_MethylVar %>% as.data.frame() %>% dplyr::select(all_of(Mut_sample_id)) %>%
mutate(ID=1:nrow(.)) %>% pivot_longer(-ID, names_to = "sample", values_to = "var")
# Merged
WT <- methyl_WT %>% inner_join(var_WT, by = c("ID","sample")) %>% na.omit() %>% arrange(mean) %>%
mutate(group = cut_interval(mean,length = 0.005)) %>%
group_by(group) %>% summarise(mean = mean(mean), var = mean(var)) %>%
ungroup()
Mut <- methyl_Mut %>% inner_join(var_Mut, by = c("ID","sample")) %>% na.omit() %>% arrange(mean) %>%
mutate(group = cut_interval(mean,length = 0.005)) %>%
group_by(group) %>% summarise(mean = mean(mean), var = mean(var)) %>%
ungroup()
## 3. ROC analysis
# WT
mcb_WT <- loessCurve(WT, 0.5)
# Mut
mcb_Mut <- loessCurve(Mut, 0.5)
## 4. t.test
set_WT <- data.frame(MM=mcb_WT$xloessDfVals,var=mcb_WT$yloessDfVals,type="WT")
set_Mut <- data.frame(MM=mcb_Mut$xloessDfVals,var=mcb_Mut$yloessDfVals,type="Mut")
dataset1 <- rbind(set_WT, set_Mut)
## 5. plot
p<-ggplot(dataset1, aes(x=MM, y=var, color=type)) +
geom_line(aes(color=type)) +
scale_color_manual(values = c("Mut" = "firebrick", "WT" = "steelblue")) +
geom_area(aes(fill = type),alpha = 0.3,position="identity") +
scale_fill_manual(values = c("Mut" = "firebrick", "WT" = "steelblue")) +
ggtitle(paste0("GBM (GSE50774)")) +
labs(x = "Methylation", y = "Variation") +
scale_y_continuous(expand=c(0,0),limits = c(0,0.05)) +
annotate("text",x=0.45,y=0.045,label=paste0("WT-IDH1 AUC = ",round(mcb_WT$loessDfAUC,3)),size=3,color="steelblue") +
annotate("text",x=0.45,y=0.04,label=paste0("Mut-IDH1 AUC = ",round(mcb_Mut$loessDfAUC,3)),size=3,color="firebrick") +
geom_segment(x = 0.5, xend = 0.5, y = 0, yend = mcb_Mut$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_segment(x = 0, xend = 0.5, y = mcb_WT$yValue, yend = mcb_WT$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_segment(x = 0, xend = 0.5, y = mcb_Mut$yValue, yend = mcb_Mut$yValue,
linetype = "dashed", color = "gray",size=0.5) +
geom_text(label = round(mcb_Mut$yValue,3), x = 0, y = mcb_Mut$yValue,
vjust = 1.5, color = "firebrick", size = 3) +
geom_text(label = round(mcb_WT$yValue,3), x = 0, y = mcb_WT$yValue,
vjust = 1.5, color = "steelblue", size = 3) +
theme_bw() +
theme(panel.background = element_blank(),
panel.border = element_rect(color = "black", linewidth = 0.8,linetype = "solid"),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.ticks = element_line(linewidth = 0.5,color="black"),
axis.line = element_blank())
print(p)