As an example, we use the pbmc3k data from SeuratData, with details shown in pbmc3k_tutorial. For convenience, we provide the data in the PaaSc package.
data <- Read10X(data.dir = "data/pbmc3k_filtered_gene_bc_matrices/hg19")
object <- CreateSeuratObject(counts = data, min.cells = 3, min.features = 200)
object
An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
PaaSc computes pathway activity score for each pathway independently and requires a background defined as a collection of pathway in the database. In general, the pathway of interest should be part of the background. Take the B_cell_activation from Gene Ontology (GO) as an example, the background pathway sets can be downloaded from MSigDB.
# MCA
object <- computeMCA(object, nmcs = 20)
1.725 sec elapsed
16.316 sec elapsed
1.528 sec elapsed
# Building gene rate for pathway and background pathway sets
background_geneset <- "data/kegg_legacy.gmt"
B_cell_activation <- readLines("data/B_cell_activation.txt")
pathway_geneset <- list(B_cell_activation = B_cell_activation)
gene_rate <- getGeneRate(background.geneset = background_geneset, pathway.geneset = pathway_geneset)
head(gene_rate)
background B_cell_activation
ABCA1 0.005376344 0
ABCA10 0.005376344 0
ABCA12 0.005376344 0
ABCA13 0.005376344 0
ABCA2 0.010752688 0
ABCA3 0.005376344 0
# Regression of gene loading (Y) and gene rate (X)
regression_data <- doRegression(object, gene.rate = gene_rate)
head(regression_data[, 1:5])
mca_1 mca_2
B_cell_activation.t 1.5047288 -0.09987953
B_cell_activation.background.t 0.3260923 -0.47468205
B_cell_activation.pvalue 0.1324888 0.92044599
B_cell_activation.background.pvalue 0.7443751 0.63504478
mca_3 mca_4
B_cell_activation.t -3.289811947 -4.924049e+00
B_cell_activation.background.t -0.012395800 6.320301e-01
B_cell_activation.pvalue 0.001012992 8.889976e-07
B_cell_activation.background.pvalue 0.990110579 5.274106e-01
mca_5
B_cell_activation.t -2.724654808
B_cell_activation.background.t -0.971835309
B_cell_activation.pvalue 0.006470545
B_cell_activation.background.pvalue 0.331203156
As shown above, mca_3,mca_4 and mca_5 are significantly associated with B_cell_activation pathway.
# Computing the pathway activity score
score_data <- computeScore(object, regression.data = regression_data, weight = FALSE, normalize = "z-score")
head(score_data)
B_cell_activation
AAACATACAACCAC-1 -0.34014887
AAACATTGAGCTAC-1 2.09314958
AAACATTGATCAGC-1 -1.10995620
AAACCGTGCTTCCG-1 0.04333788
AAACCGTGTATGCG-1 0.26093785
AAACGCACTGGTAC-1 -0.24407591
# visualization of pathway activity score using UMAP
object <- FindVariableFeatures(object, verbose = FALSE)
object <- ScaleData(object, verbose = FALSE)
object <- RunPCA(object, verbose = FALSE)
object <- FindNeighbors(object, verbose = FALSE)
object <- RunUMAP(object, dims = 1:20, verbose = FALSE)
# Ceiling score for better visualization
cscore <- score_data[colnames(object), 1]
cscore[cscore > quantile(cscore, 0.99)] = quantile(cscore, 0.99)
cscore[cscore < quantile(cscore, 0.01)] = quantile(cscore, 0.01)
object@meta.data$B_cell_activation <- cscore
FeaturePlot(object, features = "B_cell_activation", cols = c('white', 'darkred'))
Sometime, it’s helpful to binarize the pathway activity scores into different states, such as positive and negative. Here, we use Gaussian mixture model (GMM) to binarize the pathway activity scores.
# binarization
label_data <- doBinarization(score_data, n.cluster = 2, method = "GMM")
number of iterations= 10
head(label_data)
B_cell_activation.label
AAACATACAACCAC-1 negative
AAACATTGAGCTAC-1 positive
AAACATTGATCAGC-1 negative
AAACCGTGCTTCCG-1 negative
AAACCGTGTATGCG-1 negative
AAACGCACTGGTAC-1 negative
threshold <- min(score_data[rownames(label_data[label_data['B_cell_activation.label'] == 'positive', ]), 'B_cell_activation'])
ggplot(score_data, aes(x = B_cell_activation)) +
geom_histogram(bins = 50, fill = "grey", alpha = 0.7) +
geom_vline(xintercept = threshold, color = "red", linetype = "solid") +
annotate("text", x = threshold, y = 0.2, label = paste0("x=", signif(threshold, 2)), hjust = 0, vjust = 0, color = "black", size = 5) +
xlab("Activity score") + ylab("Frequency") + theme_minimal()
# visualization of cell states
object@meta.data$B_cell_activation_label <- label_data[colnames(object), ]
DimPlot(object, group.by = "B_cell_activation_label")
We may further evaluate the cell-type specificity using Area Under the Receiver Operating Characteristic Curve (AUC-ROC) value. Here, we use the pre-annotated celltype information to measure whether B_cell_activation is specific to B cells or not.
annotation_file <- "data/pbmc3k_seurat_annotation.txt"
seurat_annotations <- read.table(annotation_file, header = TRUE, row.names = 1, sep = "\t")
# Plot the pre-annotated celltype in UMAP
object@meta.data$seurat_annotations <- seurat_annotations[colnames(object), ]
DimPlot(object, group.by = "seurat_annotations")
# Distribution of B cell activation score in different cell types
score_data_withanno <- cbind(score_data, seurat_annotations)
ggplot(score_data_withanno, aes(x = seurat_annotations, y = B_cell_activation, color = seurat_annotations)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2, size = 0.01) + theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank()) +
theme(legend.position="none") + theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45, size = 12))
# Calculating the AUC-ROC value
label <- rep('others', nrow(score_data_withanno))
label[score_data_withanno$seurat_annotations == "B"] <- "B"
roc_empirical <- rocit(score = score_data_withanno$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])
Method used: empirical
Number of positive(s): 344
Number of negative(s): 2356
Area under curve: 0.9981
R version 4.4.3 (2025-02-28)
Platform: x86_64-apple-darwin20
Running under: macOS Ventura 13.7.4
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_us.utf-8/en_us.utf-8/en_us.utf-8/C/en_us.utf-8/en_us.utf-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] future_1.49.0 ROCit_2.1.2 ggplot2_3.5.2
[4] Seurat_5.2.1 SeuratObject_5.0.2 sp_2.2-0
[7] PaaSc_1.0.0 dplyr_1.1.4 kableExtra_1.4.0
[10] knitr_1.50 rmarkdown_2.29
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.3
[3] later_1.4.1 R.oo_1.27.0
[5] tibble_3.2.1 polyclip_1.10-7
[7] fastDummies_1.7.5 lifecycle_1.0.4
[9] globals_0.18.0 lattice_0.22-6
[11] MASS_7.3-65 magrittr_2.0.3
[13] plotly_4.10.4 sass_0.4.9
[15] jquerylib_0.1.4 yaml_2.3.10
[17] httpuv_1.6.15 sctransform_0.4.1
[19] askpass_1.2.1 spam_2.11-1
[21] spatstat.sparse_3.1-0 reticulate_1.42.0
[23] cowplot_1.1.3 pbapply_1.7-2
[25] RColorBrewer_1.1-3 abind_1.4-8
[27] zlibbioc_1.52.0 Rtsne_0.17
[29] GenomicRanges_1.58.0 R.utils_2.13.0
[31] mixtools_2.0.0.1 purrr_1.0.4
[33] downlit_0.4.4 BiocGenerics_0.52.0
[35] GenomeInfoDbData_1.2.13 IRanges_2.40.1
[37] S4Vectors_0.44.0 ggrepel_0.9.6
[39] irlba_2.3.5.1 listenv_0.9.1
[41] spatstat.utils_3.1-4 umap_0.2.10.0
[43] goftest_1.2-3 RSpectra_0.16-2
[45] spatstat.random_3.4-1 fitdistrplus_1.2-2
[47] parallelly_1.45.0 svglite_2.1.3
[49] codetools_0.2-20 DelayedArray_0.32.0
[51] scuttle_1.16.0 xml2_1.3.8
[53] tidyselect_1.2.1 UCSC.utils_1.2.0
[55] distill_1.6 farver_2.1.2
[57] viridis_0.6.5 ScaledMatrix_1.14.0
[59] matrixStats_1.5.0 stats4_4.4.3
[61] spatstat.explore_3.4-3 jsonlite_2.0.0
[63] BiocNeighbors_2.0.1 progressr_0.15.1
[65] ggridges_0.5.6 survival_3.8-3
[67] scater_1.34.1 systemfonts_1.2.3
[69] tictoc_1.2.1 segmented_2.1-4
[71] tools_4.4.3 ica_1.0-3
[73] Rcpp_1.0.14 glue_1.8.0
[75] gridExtra_2.3 SparseArray_1.6.2
[77] xfun_0.52 MatrixGenerics_1.18.1
[79] GenomeInfoDb_1.42.3 withr_3.0.2
[81] fastmap_1.2.0 fansi_1.0.6
[83] openssl_2.3.2 digest_0.6.37
[85] rsvd_1.0.5 R6_2.6.1
[87] mime_0.12 colorspace_2.1-1
[89] scattermore_1.2 tensor_1.5
[91] dichromat_2.0-0.1 spatstat.data_3.1-6
[93] R.methodsS3_1.8.2 tidyr_1.3.1
[95] generics_0.1.4 data.table_1.17.0
[97] httr_1.4.7 htmlwidgets_1.6.4
[99] S4Arrays_1.6.0 uwot_0.2.3
[101] pkgconfig_2.0.3 gtable_0.3.6
[103] lmtest_0.9-40 SingleCellExperiment_1.28.1
[105] XVector_0.46.0 htmltools_0.5.8.1
[107] dotCall64_1.2 bookdown_0.42
[109] fgsea_1.32.0 scales_1.4.0
[111] Biobase_2.66.0 png_0.1-8
[113] spatstat.univar_3.1-3 rstudioapi_0.17.1
[115] reshape2_1.4.4 nlme_3.1-167
[117] cachem_1.1.0 zoo_1.8-13
[119] stringr_1.5.1 KernSmooth_2.23-26
[121] vipor_0.4.7 parallel_4.4.3
[123] miniUI_0.1.1.1 pillar_1.10.1
[125] grid_4.4.3 vctrs_0.6.5
[127] RANN_2.6.2 promises_1.3.2
[129] BiocSingular_1.22.0 beachmat_2.22.0
[131] xtable_1.8-4 cluster_2.1.8
[133] beeswarm_0.4.0 CelliD_1.14.0
[135] evaluate_1.0.3 cli_3.6.5
[137] compiler_4.4.3 rlang_1.1.6
[139] crayon_1.5.3 future.apply_1.11.3
[141] labeling_0.4.3 RcppArmadillo_14.2.3-1
[143] plyr_1.8.9 ggbeeswarm_0.7.2
[145] stringi_1.8.7 viridisLite_0.4.2
[147] deldir_2.0-4 BiocParallel_1.40.0
[149] lazyeval_0.2.2 spatstat.geom_3.4-1
[151] Matrix_1.7-3 RcppHNSW_0.6.0
[153] patchwork_1.3.0 shiny_1.10.0
[155] SummarizedExperiment_1.36.0 kernlab_0.9-33
[157] ROCR_1.0-11 fontawesome_0.5.3
[159] igraph_2.1.4 memoise_2.0.1
[161] bslib_0.9.0 fastmatch_1.1-6