As an example, we use the pbmc3k
data from SeuratData, with details shown in pbmc3k_tutorial. For convenience, we provide the data in the PaaSc package.
data <- Read10X(data.dir = "data/pbmc3k_filtered_gene_bc_matrices/hg19")
object <- CreateSeuratObject(counts = data, min.cells = 3, min.features = 200)
object
An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
PaaSc computes pathway activity score for each pathway independently and requires a background defined as a collection of pathway in the database. In general, the pathway of interest should be part of the background. Take the B_cell_activation
from Gene Ontology (GO) as an example, the background pathway sets can be downloaded from MSigDB.
# MCA
object <- computeMCA(object, nmcs = 20)
1.194 sec elapsed
17.916 sec elapsed
1.596 sec elapsed
# Building gene rate for pathway and background pathway sets
background_geneset <- "data/kegg_legacy.gmt"
B_cell_activation <- readLines("data/B_cell_activation.txt")
pathway_geneset <- list(B_cell_activation = B_cell_activation)
gene_rate <- getGeneRate(background.geneset = background_geneset, pathway.geneset = pathway_geneset)
head(gene_rate)
background B_cell_activation
ABCA1 0.005376344 0
ABCA10 0.005376344 0
ABCA12 0.005376344 0
ABCA13 0.005376344 0
ABCA2 0.010752688 0
ABCA3 0.005376344 0
# Regression of gene loading (Y) and gene rate (X)
regression_data <- doRegression(object, gene.rate = gene_rate)
head(regression_data[, 1:5])
mca_1 mca_2 mca_3
B_cell_activation.t 1.5047288 0.09987953 -3.289811947
B_cell_activation.background.t 0.3260923 0.47468205 -0.012395800
B_cell_activation.pvalue 0.1324888 0.92044599 0.001012992
B_cell_activation.background.pvalue 0.7443751 0.63504478 0.990110579
mca_4 mca_5
B_cell_activation.t -4.924049e+00 -2.724654808
B_cell_activation.background.t 6.320301e-01 -0.971835309
B_cell_activation.pvalue 8.889976e-07 0.006470545
B_cell_activation.background.pvalue 5.274106e-01 0.331203156
As shown above, mca_3
,mca_4
and mca_5
are significantly associated with B_cell_activation
pathway.
# Computing the pathway activity score
score_data <- computeScore(object, regression.data = regression_data, weight = FALSE, normalize = "z-score")
head(score_data)
B_cell_activation
AAACATACAACCAC-1 -0.34014887
AAACATTGAGCTAC-1 2.09314958
AAACATTGATCAGC-1 -1.10995620
AAACCGTGCTTCCG-1 0.04333788
AAACCGTGTATGCG-1 0.26093785
AAACGCACTGGTAC-1 -0.24407591
# visualization of pathway activity score using UMAP
object <- FindVariableFeatures(object, verbose = FALSE)
object <- ScaleData(object, verbose = FALSE)
object <- RunPCA(object, verbose = FALSE)
object <- FindNeighbors(object, verbose = FALSE)
object <- RunUMAP(object, dims = 1:20, verbose = FALSE)
# Ceiling score for better visualization
cscore <- score_data[colnames(object), 1]
cscore[cscore > quantile(cscore, 0.99)] = quantile(cscore, 0.99)
cscore[cscore < quantile(cscore, 0.01)] = quantile(cscore, 0.01)
object@meta.data$B_cell_activation <- cscore
FeaturePlot(object, features = "B_cell_activation", cols = c('white', 'darkred'))
Sometime, it’s helpful to binarize the pathway activity scores into different states, such as positive and negative. Here, we use Gaussian mixture model (GMM) to binarize the pathway activity scores.
# binarization
label_data <- doBinarization(score_data, n.cluster = 2, method = "GMM")
number of iterations= 10
head(label_data)
B_cell_activation.label
AAACATACAACCAC-1 negative
AAACATTGAGCTAC-1 positive
AAACATTGATCAGC-1 negative
AAACCGTGCTTCCG-1 negative
AAACCGTGTATGCG-1 negative
AAACGCACTGGTAC-1 negative
threshold <- min(score_data[rownames(label_data[label_data['B_cell_activation.label'] == 'positive', ]), 'B_cell_activation'])
ggplot(score_data, aes(x = B_cell_activation)) +
geom_histogram(bins = 50, fill = "grey", alpha = 0.7) +
geom_vline(xintercept = threshold, color = "red", linetype = "solid") +
annotate("text", x = threshold, y = 0.2, label = paste0("x=", signif(threshold, 2)), hjust = 0, vjust = 0, color = "black", size = 5) +
xlab("Activity score") + ylab("Frequency") + theme_minimal()
# visualization of cell states
object@meta.data$B_cell_activation_label <- label_data[colnames(object), ]
DimPlot(object, group.by = "B_cell_activation_label")
We may further evaluate the cell-type specificity using Area Under the Receiver Operating Characteristic Curve (AUC-ROC) value. Here, we use the pre-annotated celltype information to measure whether B_cell_activation
is specific to B cells or not.
annotation_file <- "data/pbmc3k_seurat_annotation.txt"
seurat_annotations <- read.table(annotation_file, header = TRUE, row.names = 1, sep = "\t")
# Plot the pre-annotated celltype in UMAP
object@meta.data$seurat_annotations <- seurat_annotations[colnames(object), ]
DimPlot(object, group.by = "seurat_annotations")
# Distribution of B cell activation score in different cell types
score_data_withanno <- cbind(score_data, seurat_annotations)
ggplot(score_data_withanno, aes(x = seurat_annotations, y = B_cell_activation, color = seurat_annotations)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2, size = 0.01) + theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank()) +
theme(legend.position="none") + theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45, size = 12))
# Calculating the AUC-ROC value
label <- rep('others', nrow(score_data_withanno))
label[score_data_withanno$seurat_annotations == "B"] <- "B"
roc_empirical <- rocit(score = score_data_withanno$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])
Method used: empirical
Number of positive(s): 344
Number of negative(s): 2356
Area under curve: 0.9981
R version 4.4.2 (2024-10-31)
Platform: x86_64-apple-darwin20
Running under: macOS Sequoia 15.3.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] C/UTF-8/C/C/C/C
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] ROCit_2.1.2 PaaSc_1.0.0
[3] stxBrain.SeuratData_0.1.2 ifnb.SeuratData_3.1.0
[5] SeuratData_0.2.2.9001 Seurat_5.1.0
[7] SeuratObject_5.0.2 sp_2.1-4
[9] msigdbr_7.5.1 reshape2_1.4.4
[11] ggrepel_0.9.6 ggplot2_3.5.1
[13] knitr_1.48 rmarkdown_2.28
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.2
[3] later_1.3.2 R.oo_1.27.0
[5] tibble_3.2.1 polyclip_1.10-7
[7] fastDummies_1.7.4 lifecycle_1.0.4
[9] globals_0.16.3 lattice_0.22-6
[11] MASS_7.3-61 magrittr_2.0.3
[13] plotly_4.10.4 sass_0.4.9
[15] jquerylib_0.1.4 yaml_2.3.10
[17] httpuv_1.6.15 glmGamPoi_1.18.0
[19] sctransform_0.4.1 askpass_1.2.1
[21] spam_2.11-0 spatstat.sparse_3.1-0
[23] reticulate_1.39.0 cowplot_1.1.3
[25] pbapply_1.7-2 RColorBrewer_1.1-3
[27] abind_1.4-8 zlibbioc_1.52.0
[29] Rtsne_0.17 GenomicRanges_1.58.0
[31] R.utils_2.12.3 mixtools_2.0.0
[33] purrr_1.0.2 downlit_0.4.4
[35] BiocGenerics_0.52.0 rappdirs_0.3.3
[37] GenomeInfoDbData_1.2.13 IRanges_2.40.0
[39] S4Vectors_0.44.0 irlba_2.3.5.1
[41] listenv_0.9.1 spatstat.utils_3.1-0
[43] umap_0.2.10.0 goftest_1.2-3
[45] RSpectra_0.16-2 spatstat.random_3.3-2
[47] fitdistrplus_1.2-1 parallelly_1.38.0
[49] DelayedMatrixStats_1.28.0 leiden_0.4.3.1
[51] codetools_0.2-20 DelayedArray_0.32.0
[53] scuttle_1.16.0 tidyselect_1.2.1
[55] UCSC.utils_1.2.0 distill_1.6
[57] farver_2.1.2 viridis_0.6.5
[59] ScaledMatrix_1.14.0 matrixStats_1.4.1
[61] stats4_4.4.2 spatstat.explore_3.3-3
[63] jsonlite_1.8.9 BiocNeighbors_2.0.0
[65] progressr_0.15.0 ggridges_0.5.6
[67] survival_3.7-0 scater_1.34.0
[69] tictoc_1.2.1 segmented_2.1-3
[71] tools_4.4.2 ica_1.0-3
[73] Rcpp_1.0.13-1 glue_1.8.0
[75] gridExtra_2.3 SparseArray_1.6.0
[77] xfun_0.49 MatrixGenerics_1.18.0
[79] GenomeInfoDb_1.42.0 dplyr_1.1.4
[81] withr_3.0.2 fastmap_1.2.0
[83] fansi_1.0.6 openssl_2.2.2
[85] rsvd_1.0.5 digest_0.6.37
[87] R6_2.5.1 mime_0.12
[89] colorspace_2.1-1 scattermore_1.2
[91] tensor_1.5 spatstat.data_3.1-2
[93] R.methodsS3_1.8.2 utf8_1.2.4
[95] tidyr_1.3.1 generics_0.1.3
[97] data.table_1.16.2 httr_1.4.7
[99] htmlwidgets_1.6.4 S4Arrays_1.6.0
[101] uwot_0.2.2 pkgconfig_2.0.3
[103] gtable_0.3.6 lmtest_0.9-40
[105] SingleCellExperiment_1.28.0 XVector_0.46.0
[107] htmltools_0.5.8.1 dotCall64_1.2
[109] bookdown_0.41 fgsea_1.32.0
[111] scales_1.3.0 Biobase_2.66.0
[113] png_0.1-8 spatstat.univar_3.0-1
[115] rstudioapi_0.17.1 nlme_3.1-166
[117] cachem_1.1.0 zoo_1.8-12
[119] stringr_1.5.1 KernSmooth_2.23-24
[121] vipor_0.4.7 parallel_4.4.2
[123] miniUI_0.1.1.1 pillar_1.9.0
[125] grid_4.4.2 vctrs_0.6.5
[127] RANN_2.6.2 promises_1.3.0
[129] BiocSingular_1.22.0 beachmat_2.22.0
[131] xtable_1.8-4 cluster_2.1.6
[133] beeswarm_0.4.0 CelliD_1.14.0
[135] evaluate_1.0.1 cli_3.6.3
[137] compiler_4.4.2 rlang_1.1.4
[139] crayon_1.5.3 future.apply_1.11.3
[141] labeling_0.4.3 RcppArmadillo_14.0.2-1
[143] ggbeeswarm_0.7.2 plyr_1.8.9
[145] stringi_1.8.4 viridisLite_0.4.2
[147] deldir_2.0-4 BiocParallel_1.40.0
[149] babelgene_22.9 munsell_0.5.1
[151] lazyeval_0.2.2 spatstat.geom_3.3-3
[153] Matrix_1.7-1 RcppHNSW_0.6.0
[155] patchwork_1.3.0 sparseMatrixStats_1.18.0
[157] future_1.34.0 shiny_1.9.1
[159] SummarizedExperiment_1.36.0 highr_0.11
[161] kernlab_0.9-33 ROCR_1.0-11
[163] fontawesome_0.5.2 igraph_2.1.1
[165] memoise_2.0.1 bslib_0.8.0
[167] fastmatch_1.1-4