Pathway Activity Analysis for Single Cells

loading packages

library("Seurat")
library("ggplot2")
library("PaaSc")
library("ROCit")

Data preparation

As an example, we use the pbmc3k data from SeuratData, with details shown in pbmc3k_tutorial. For convenience, we provide the data in the PaaSc package.

data <- Read10X(data.dir = "data/pbmc3k_filtered_gene_bc_matrices/hg19")
object <- CreateSeuratObject(counts = data, min.cells = 3, min.features = 200)
object

An object of class Seurat 
13714 features across 2700 samples within 1 assay 
Active assay: RNA (13714 features, 0 variable features)
 1 layer present: counts

Computing pathway activity scores

PaaSc computes pathway activity score for each pathway independently and requires a background defined as a collection of pathway in the database. In general, the pathway of interest should be part of the background. Take the B_cell_activation from Gene Ontology (GO) as an example, the background pathway sets can be downloaded from MSigDB.

# MCA
object <- computeMCA(object, nmcs = 20)

1.194 sec elapsed

17.916 sec elapsed

1.596 sec elapsed

# Building gene rate for pathway and background pathway sets
background_geneset <- "data/kegg_legacy.gmt"
B_cell_activation <- readLines("data/B_cell_activation.txt")
pathway_geneset <- list(B_cell_activation = B_cell_activation)
gene_rate <- getGeneRate(background.geneset = background_geneset, pathway.geneset = pathway_geneset)
head(gene_rate)

        background B_cell_activation
ABCA1  0.005376344                 0
ABCA10 0.005376344                 0
ABCA12 0.005376344                 0
ABCA13 0.005376344                 0
ABCA2  0.010752688                 0
ABCA3  0.005376344                 0

# Regression of gene loading (Y) and gene rate (X)
regression_data <- doRegression(object, gene.rate = gene_rate)
head(regression_data[, 1:5])

                                        mca_1      mca_2        mca_3
B_cell_activation.t                 1.5047288 0.09987953 -3.289811947
B_cell_activation.background.t      0.3260923 0.47468205 -0.012395800
B_cell_activation.pvalue            0.1324888 0.92044599  0.001012992
B_cell_activation.background.pvalue 0.7443751 0.63504478  0.990110579
                                            mca_4        mca_5
B_cell_activation.t                 -4.924049e+00 -2.724654808
B_cell_activation.background.t       6.320301e-01 -0.971835309
B_cell_activation.pvalue             8.889976e-07  0.006470545
B_cell_activation.background.pvalue  5.274106e-01  0.331203156

As shown above, mca_3,mca_4 and mca_5 are significantly associated with B_cell_activation pathway.

# Computing the pathway activity score
score_data <- computeScore(object, regression.data = regression_data, weight = FALSE, normalize = "z-score")
head(score_data)

                 B_cell_activation
AAACATACAACCAC-1       -0.34014887
AAACATTGAGCTAC-1        2.09314958
AAACATTGATCAGC-1       -1.10995620
AAACCGTGCTTCCG-1        0.04333788
AAACCGTGTATGCG-1        0.26093785
AAACGCACTGGTAC-1       -0.24407591

# visualization of pathway activity score using UMAP
object <- FindVariableFeatures(object, verbose = FALSE)
object <- ScaleData(object, verbose = FALSE)
object <- RunPCA(object, verbose = FALSE)
object <- FindNeighbors(object, verbose = FALSE)
object <- RunUMAP(object, dims = 1:20, verbose = FALSE)
# Ceiling score for better visualization
cscore <- score_data[colnames(object), 1]
cscore[cscore > quantile(cscore, 0.99)] = quantile(cscore, 0.99)
cscore[cscore < quantile(cscore, 0.01)] = quantile(cscore, 0.01)
object@meta.data$B_cell_activation <- cscore
FeaturePlot(object, features = "B_cell_activation", cols = c('white', 'darkred'))

Binarization and label the cell

Sometime, it’s helpful to binarize the pathway activity scores into different states, such as positive and negative. Here, we use Gaussian mixture model (GMM) to binarize the pathway activity scores.

# binarization
label_data <- doBinarization(score_data, n.cluster = 2, method = "GMM")

number of iterations= 10

head(label_data)

                 B_cell_activation.label
AAACATACAACCAC-1                negative
AAACATTGAGCTAC-1                positive
AAACATTGATCAGC-1                negative
AAACCGTGCTTCCG-1                negative
AAACCGTGTATGCG-1                negative
AAACGCACTGGTAC-1                negative

threshold <- min(score_data[rownames(label_data[label_data['B_cell_activation.label'] == 'positive', ]), 'B_cell_activation'])
ggplot(score_data, aes(x = B_cell_activation)) +
  geom_histogram(bins = 50, fill = "grey", alpha = 0.7) + 
  geom_vline(xintercept = threshold, color = "red", linetype = "solid") +
  annotate("text", x = threshold, y = 0.2, label = paste0("x=", signif(threshold, 2)), hjust = 0, vjust = 0, color = "black", size = 5) + 
  xlab("Activity score") + ylab("Frequency") + theme_minimal()

# visualization of cell states
object@meta.data$B_cell_activation_label <- label_data[colnames(object), ]
DimPlot(object, group.by = "B_cell_activation_label")

Evaluation of cell-type specificity

We may further evaluate the cell-type specificity using Area Under the Receiver Operating Characteristic Curve (AUC-ROC) value. Here, we use the pre-annotated celltype information to measure whether B_cell_activation is specific to B cells or not.

annotation_file <- "data/pbmc3k_seurat_annotation.txt"
seurat_annotations <- read.table(annotation_file, header = TRUE, row.names = 1, sep = "\t")

# Plot the pre-annotated celltype in UMAP
object@meta.data$seurat_annotations <- seurat_annotations[colnames(object), ]
DimPlot(object, group.by = "seurat_annotations")

# Distribution of B cell activation score in different cell types
score_data_withanno <- cbind(score_data, seurat_annotations)
ggplot(score_data_withanno, aes(x = seurat_annotations, y = B_cell_activation, color = seurat_annotations)) + 
    geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2, size = 0.01) + theme_bw() + 
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
    theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank()) + 
    theme(legend.position="none") +  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45, size = 12))

# Calculating the AUC-ROC value
label <- rep('others', nrow(score_data_withanno))
label[score_data_withanno$seurat_annotations == "B"] <- "B"
roc_empirical <- rocit(score = score_data_withanno$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])

                            
 Method used: empirical     
 Number of positive(s): 344 
 Number of negative(s): 2356
 Area under curve: 0.9981

plot(roc_empirical, YIndex = F, values = F, col = c(2,1))
text(0.1, 0.6, title)

sessionInfo()

sessionInfo()

R version 4.4.2 (2024-10-31)
Platform: x86_64-apple-darwin20
Running under: macOS Sequoia 15.3.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] C/UTF-8/C/C/C/C

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
 [1] ROCit_2.1.2               PaaSc_1.0.0              
 [3] stxBrain.SeuratData_0.1.2 ifnb.SeuratData_3.1.0    
 [5] SeuratData_0.2.2.9001     Seurat_5.1.0             
 [7] SeuratObject_5.0.2        sp_2.1-4                 
 [9] msigdbr_7.5.1             reshape2_1.4.4           
[11] ggrepel_0.9.6             ggplot2_3.5.1            
[13] knitr_1.48                rmarkdown_2.28           

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.4.2              
  [3] later_1.3.2                 R.oo_1.27.0                
  [5] tibble_3.2.1                polyclip_1.10-7            
  [7] fastDummies_1.7.4           lifecycle_1.0.4            
  [9] globals_0.16.3              lattice_0.22-6             
 [11] MASS_7.3-61                 magrittr_2.0.3             
 [13] plotly_4.10.4               sass_0.4.9                 
 [15] jquerylib_0.1.4             yaml_2.3.10                
 [17] httpuv_1.6.15               glmGamPoi_1.18.0           
 [19] sctransform_0.4.1           askpass_1.2.1              
 [21] spam_2.11-0                 spatstat.sparse_3.1-0      
 [23] reticulate_1.39.0           cowplot_1.1.3              
 [25] pbapply_1.7-2               RColorBrewer_1.1-3         
 [27] abind_1.4-8                 zlibbioc_1.52.0            
 [29] Rtsne_0.17                  GenomicRanges_1.58.0       
 [31] R.utils_2.12.3              mixtools_2.0.0             
 [33] purrr_1.0.2                 downlit_0.4.4              
 [35] BiocGenerics_0.52.0         rappdirs_0.3.3             
 [37] GenomeInfoDbData_1.2.13     IRanges_2.40.0             
 [39] S4Vectors_0.44.0            irlba_2.3.5.1              
 [41] listenv_0.9.1               spatstat.utils_3.1-0       
 [43] umap_0.2.10.0               goftest_1.2-3              
 [45] RSpectra_0.16-2             spatstat.random_3.3-2      
 [47] fitdistrplus_1.2-1          parallelly_1.38.0          
 [49] DelayedMatrixStats_1.28.0   leiden_0.4.3.1             
 [51] codetools_0.2-20            DelayedArray_0.32.0        
 [53] scuttle_1.16.0              tidyselect_1.2.1           
 [55] UCSC.utils_1.2.0            distill_1.6                
 [57] farver_2.1.2                viridis_0.6.5              
 [59] ScaledMatrix_1.14.0         matrixStats_1.4.1          
 [61] stats4_4.4.2                spatstat.explore_3.3-3     
 [63] jsonlite_1.8.9              BiocNeighbors_2.0.0        
 [65] progressr_0.15.0            ggridges_0.5.6             
 [67] survival_3.7-0              scater_1.34.0              
 [69] tictoc_1.2.1                segmented_2.1-3            
 [71] tools_4.4.2                 ica_1.0-3                  
 [73] Rcpp_1.0.13-1               glue_1.8.0                 
 [75] gridExtra_2.3               SparseArray_1.6.0          
 [77] xfun_0.49                   MatrixGenerics_1.18.0      
 [79] GenomeInfoDb_1.42.0         dplyr_1.1.4                
 [81] withr_3.0.2                 fastmap_1.2.0              
 [83] fansi_1.0.6                 openssl_2.2.2              
 [85] rsvd_1.0.5                  digest_0.6.37              
 [87] R6_2.5.1                    mime_0.12                  
 [89] colorspace_2.1-1            scattermore_1.2            
 [91] tensor_1.5                  spatstat.data_3.1-2        
 [93] R.methodsS3_1.8.2           utf8_1.2.4                 
 [95] tidyr_1.3.1                 generics_0.1.3             
 [97] data.table_1.16.2           httr_1.4.7                 
 [99] htmlwidgets_1.6.4           S4Arrays_1.6.0             
[101] uwot_0.2.2                  pkgconfig_2.0.3            
[103] gtable_0.3.6                lmtest_0.9-40              
[105] SingleCellExperiment_1.28.0 XVector_0.46.0             
[107] htmltools_0.5.8.1           dotCall64_1.2              
[109] bookdown_0.41               fgsea_1.32.0               
[111] scales_1.3.0                Biobase_2.66.0             
[113] png_0.1-8                   spatstat.univar_3.0-1      
[115] rstudioapi_0.17.1           nlme_3.1-166               
[117] cachem_1.1.0                zoo_1.8-12                 
[119] stringr_1.5.1               KernSmooth_2.23-24         
[121] vipor_0.4.7                 parallel_4.4.2             
[123] miniUI_0.1.1.1              pillar_1.9.0               
[125] grid_4.4.2                  vctrs_0.6.5                
[127] RANN_2.6.2                  promises_1.3.0             
[129] BiocSingular_1.22.0         beachmat_2.22.0            
[131] xtable_1.8-4                cluster_2.1.6              
[133] beeswarm_0.4.0              CelliD_1.14.0              
[135] evaluate_1.0.1              cli_3.6.3                  
[137] compiler_4.4.2              rlang_1.1.4                
[139] crayon_1.5.3                future.apply_1.11.3        
[141] labeling_0.4.3              RcppArmadillo_14.0.2-1     
[143] ggbeeswarm_0.7.2            plyr_1.8.9                 
[145] stringi_1.8.4               viridisLite_0.4.2          
[147] deldir_2.0-4                BiocParallel_1.40.0        
[149] babelgene_22.9              munsell_0.5.1              
[151] lazyeval_0.2.2              spatstat.geom_3.3-3        
[153] Matrix_1.7-1                RcppHNSW_0.6.0             
[155] patchwork_1.3.0             sparseMatrixStats_1.18.0   
[157] future_1.34.0               shiny_1.9.1                
[159] SummarizedExperiment_1.36.0 highr_0.11                 
[161] kernlab_0.9-33              ROCR_1.0-11                
[163] fontawesome_0.5.2           igraph_2.1.1               
[165] memoise_2.0.1               bslib_0.8.0                
[167] fastmatch_1.1-4