Pseudotime analysis using Monocle2

Loading packages

library(Seurat)
library(monocle)
library(data.table)

Loading required package: SeuratObject

Loading required package: sp


Attaching package: ‘SeuratObject’


The following objects are masked from ‘package:base’:

    intersect, saveRDS


Loading Seurat v5 beta version 
To maintain compatibility with previous workflows, new Seurat objects will use the previous object structure by default
To use new Seurat v5 assays: Please run: options(Seurat.object.assay.version = 'v5')

Loading required package: Matrix

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following object is masked from ‘package:SeuratObject’:

    intersect


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: ggplot2

Loading required package: VGAM

Loading required package: stats4

Loading required package: splines

Loading required package: DDRTree

Loading required package: irlba

Loading data

# Read the serialized Seurat object from disk.
project <- readRDS(file = "data/PbmcRDS/pbmc.rds")

# Auto-print the object to show its summary.
project

An object of class Seurat 
13714 features across 2638 samples within 1 assay 
Active assay: RNA (13714 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 3 dimensional reductions calculated: pca, tsne, umap

# Overview plot of the cells in the Seurat object.
DimPlot(object = project)

# Expression of CD8B per cell, on a grey-to-red colour scale, with labels.
FeaturePlot(
  object = project,
  features = "CD8B",
  cols = c("grey95", "red3"),
  pt.size = 0.001,
  label = TRUE
)

# Dot plot of a small gene panel, with rotated x-axis labels.
features <- c("CD8A", "LYZ", "CCL5", "IL32", "PTPRCAP", "FCGR3A", "PF4")
DotPlot(object = project, features = features) + RotatedAxis()

Building a new CellDataSet (CDS) object

# Raw count matrix of the RNA assay. The counts slot is coerced directly to a
# sparse matrix: going through as.matrix() first would allocate a full dense
# genes x cells matrix only to sparsify it again.
data <- as(project@assays$RNA@counts, "sparseMatrix")

# Per-cell annotations (one row per cell).
pd <- project@meta.data

# Per-gene annotations. The names are taken from the expression matrix itself
# so that the feature table always lines up with its rows.
fd <- data.frame(
  gene_short_name = rownames(data),
  row.names = rownames(data)
)

head(pd)

head(fd)

# Wrap the cell and gene annotation tables in AnnotatedDataFrame objects.
pd <- new("AnnotatedDataFrame", data = pd)
fd <- new("AnnotatedDataFrame", data = fd)

# Assemble the CellDataSet from the expression matrix and both annotations.
cds <- newCellDataSet(
  data,                                  # expression matrix
  phenoData = pd,                        # attributes of individual cells
  featureData = fd,                      # attributes of features (genes)
  lowerDetectionLimit = 0.5,             # minimum level counted as expressed
  expressionFamily = negbinomial.size()  # VGAM family for the expression response
)

cds

CellDataSet (storageMode: environment)
assayData: 13714 features, 2638 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: AAACATACAACCAC-1 AAACATTGAGCTAC-1 ... TTTGCATGCCTCAC-1
    (2638 total)
  varLabels: orig.ident nCount_RNA ... Size_Factor (8 total)
  varMetadata: labelDescription
featureData
  featureNames: AL627309.1 AP006222.2 ... SRSF10.1 (13714 total)
  fvarLabels: gene_short_name
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:

Estimating size factors and dispersions

# Estimate per-cell size factors, then gene dispersions, storing both in cds.
cds <- estimateSizeFactors(object = cds)
cds <- estimateDispersions(object = cds)

Warning message:
“`group_by_()` was deprecated in dplyr 0.7.0.
ℹ Please use `group_by()` instead.
ℹ See vignette('programming') for more help
ℹ The deprecated feature was likely used in the monocle package.
  Please report the issue to the authors.”
Warning message:
“`select_()` was deprecated in dplyr 0.7.0.
ℹ Please use `select()` instead.
ℹ The deprecated feature was likely used in the monocle package.
  Please report the issue to the authors.”
Warning message:
“glm.fit: algorithm did not converge”
Removing 276 outliers

Filtering low-quality cells (recommended)

# Run gene detection with an expression threshold of 0.1. The tables printed
# below then carry num_cells_expressed (genes) and num_genes_expressed (cells).
cds <- detectGenes(cds, min_expr = 0.1)

# Inspect the gene-level annotations.
head(fData(cds))

# Inspect the cell-level annotations.
head(pData(cds))

Constructing single-cell trajectories

Step 1: choosing genes that define progress

# Compute marker genes for every identity class of the Seurat object.
des <- FindAllMarkers(object = project)

Calculating cluster Naive CD4 T

Calculating cluster Memory CD4 T

Calculating cluster CD14+ Mono

Calculating cluster B

Calculating cluster CD8 T

Calculating cluster FCGR3A+ Mono

Calculating cluster NK

Calculating cluster DC

Calculating cluster Platelet

head(des)

# Keep positive, reasonably expressed cluster markers as ordering-gene
# candidates. Ribosomal protein genes (symbols starting with RPL or RPS) are
# excluded. The character class is [LS]: the former [L|S] also matched a
# literal "|", because "|" is not an alternation operator inside brackets.
# NOTE(review): the significance cut-off uses the unadjusted p_val; consider
# p_val_adj if multiple-testing control is wanted.
diff <- subset(
  des[!grepl("^RP[LS]", des$gene), ],
  subset = avg_log2FC > 0.25 & pct.1 > 0.25 & (pct.1 > pct.2) & p_val < 0.05
)

dim(diff)

head(diff)

# Export the filtered marker table as tab-separated text without row names.
write.table(
  diff,
  file = "data/PbmcRDS/monocle_DEG.txt",
  col.names = TRUE,
  row.names = FALSE,
  sep = "\t",
  quote = FALSE
)

# Unique marker genes become the ordering genes of the trajectory.
ordergene <- as.vector(unique(diff$gene))
cds <- setOrderingFilter(cds, ordergene)

cds

CellDataSet (storageMode: environment)
assayData: 13714 features, 2638 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: AAACATACAACCAC-1 AAACATTGAGCTAC-1 ... TTTGCATGCCTCAC-1
    (2638 total)
  varLabels: orig.ident nCount_RNA ... num_genes_expressed (9 total)
  varMetadata: labelDescription
featureData
  featureNames: AL627309.1 AP006222.2 ... SRSF10.1 (13714 total)
  fvarLabels: gene_short_name num_cells_expressed use_for_ordering
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:

# Visualise the genes flagged for ordering.
plot_ordering_genes(cds = cds)

Warning message:
“Transformation introduced infinite values in continuous y-axis”
Warning message:
“Transformation introduced infinite values in continuous y-axis”

Step 2: reducing the dimensionality of the data

# Project the cells onto two components using the DDRTree method.
cds <- reduceDimension(
  cds,
  method = "DDRTree",
  max_components = 2
)

Found more than one class "dist" in cache; using the first, from namespace 'spam'

Also defined by ‘BiocGenerics’

Found more than one class "dist" in cache; using the first, from namespace 'spam'

Also defined by ‘BiocGenerics’

Step 3: ordering cells along the trajectory

# Order the cells along the learned trajectory with the default root.
cds <- orderCells(cds)

Warning message in graph.dfs(dp_mst, root = root_cell, neimode = "all", unreachable = FALSE, :
“Argument `neimode' is deprecated; use `mode' instead”
Warning message in graph.dfs(dp_mst, root = root_cell, neimode = "all", unreachable = FALSE, :
“Argument `neimode' is deprecated; use `mode' instead”

#choose the root
#cds <- orderCells(cds,root_state=?)

Visualization

By pseudotime

# Trajectory coloured by the Pseudotime column of pData(cds).
# NOTE(review): `size` does not appear to be a named formal of
# plot_cell_trajectory (the point-size argument is `cell_size`) -- confirm it
# has the intended effect.
plot_cell_trajectory(
  cds,
  color_by = "Pseudotime",
  show_backbone = TRUE,
  size = 1
)

By cell type

# Trajectory coloured by the CellType column of pData(cds).
plot_cell_trajectory(
  cds,
  color_by = "CellType",
  show_backbone = TRUE,
  size = 1
)

By cell state

# Trajectory coloured by the State column of pData(cds).
plot_cell_trajectory(
  cds,
  color_by = "State",
  show_backbone = TRUE,
  size = 1
)

# Same trajectory coloured by cell type, one panel per State in a single row.
plot_cell_trajectory(cds, color_by = "CellType") +
  facet_wrap("~State", nrow = 1)

Displaying specific genes

# Take the first five ordering genes (fewer if less are available).
keygenes <- ordergene[seq_len(min(5, length(ordergene)))]

# Expression of these genes along pseudotime, coloured by State ...
plot_genes_in_pseudotime(cds[keygenes], color_by = "State")

# ... and coloured by cell type.
plot_genes_in_pseudotime(cds[keygenes], color_by = "CellType")

Finding trajectory-associated genes

# Alternative model formula: fullModelFormulaStr = "~sm.ns(Pseudotime)".
# Candidate terms noted by the author: "Pseudotime", "CellType", "Cluster".

head(pData(cds))

# Test each ordering gene for association with Pseudotime on a single core.
Deg_Pseudotime <- differentialGeneTest(
  cds[ordergene, ],
  fullModelFormulaStr = "~ Pseudotime",
  cores = 1
)

# Sort the results by ascending q-value.
Deg_Pseudotime <- Deg_Pseudotime[order(Deg_Pseudotime$qval), ]

head(Deg_Pseudotime)

# Export the sorted table as tab-separated text without row names.
write.table(
  Deg_Pseudotime,
  file = "data/PbmcRDS/Diff_pseudotime_heatmap.txt",
  sep = "\t",
  row.names = FALSE
)

# Show a single gene: store log2(expression + 1) of S100A9 as a cell-level
# column and colour the trajectory by it.
pData(cds)$S100A9 <- log2(exprs(cds)["S100A9", ] + 1)
plot_cell_trajectory(cds, color_by = "S100A9")

# Heatmap of the top pseudotime-associated genes (Deg_Pseudotime is already
# sorted by q-value). head() returns at most 50 names, whereas [1:50] would
# pad with NA when fewer genes were tested and break the subsetting below.
# as.character() guards against the column being a factor.
top50 <- head(as.character(Deg_Pseudotime$gene_short_name), 50)
plot_pseudotime_heatmap(
  cds[top50, ],
  num_clusters = 4,
  cores = 1,
  show_rownames = TRUE
)

# Expression of the four top-ranked genes along pseudotime in a 2 x 2 grid.
plot_genes_in_pseudotime(cds[head(top50, 4), ], nrow = 2, ncol = 2)

Branched expression analysis modeling (BEAM)

# Run BEAM at branch point 1 on a single core.
BEAM_res <- BEAM(cds, branch_point = 1, cores = 1)

# Sort the results by ascending q-value.
BEAM_res <- BEAM_res[order(BEAM_res$qval), ]

head(BEAM_res)

# Branched heatmap of the top-ranked genes. head() yields at most 50 row names;
# row.names(BEAM_res[1:50, ]) would produce "NA" names when fewer than 50 rows
# exist, and those cannot be used to subset cds.
plot_genes_branched_heatmap(
  cds[head(row.names(BEAM_res), 50), ],
  branch_point = 1,
  num_clusters = 4,
  cores = 1,
  use_gene_short_name = TRUE,
  show_rownames = TRUE,
  return_heatmap = FALSE
)

# Export the full BEAM table. The file is tab-separated text despite its
# .xls extension.
write.table(
  BEAM_res,
  file = "data/PbmcRDS/branch1_genes_analysis.xls",
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)

# Persist the CellDataSet and the BEAM results for later sessions.
save(list = c("cds", "BEAM_res"), file = "data/PbmcRDS/monocle_PbmcRDS.RData")

	orig.ident	nCount_RNA	nFeature_RNA	percent.mt	RNA_snn_res.0.5	seurat_clusters	CellType
	<fct>	<dbl>	<int>	<dbl>	<fct>	<fct>	<fct>
AAACATACAACCAC-1	pbmc3k	2419	779	3.0177759	2	2	CD14+ Mono
AAACATTGAGCTAC-1	pbmc3k	4903	1352	3.7935958	3	3	B
AAACATTGATCAGC-1	pbmc3k	3147	1129	0.8897363	2	2	CD14+ Mono
AAACCGTGCTTCCG-1	pbmc3k	2639	960	1.7430845	1	1	Memory CD4 T
AAACCGTGTATGCG-1	pbmc3k	980	521	1.2244898	6	6	NK
AAACGCACTGGTAC-1	pbmc3k	2163	781	1.6643551	2	2	CD14+ Mono

	gene_short_name
	<chr>
AL627309.1	AL627309.1
AP006222.2	AP006222.2
RP11-206L10.2	RP11-206L10.2
RP11-206L10.9	RP11-206L10.9
LINC00115	LINC00115
NOC2L	NOC2L

	gene_short_name	num_cells_expressed
	<chr>	<int>
AL627309.1	AL627309.1	9
AP006222.2	AP006222.2	3
RP11-206L10.2	RP11-206L10.2	5
RP11-206L10.9	RP11-206L10.9	3
LINC00115	LINC00115	18
NOC2L	NOC2L	254

	orig.ident	nCount_RNA	nFeature_RNA	percent.mt	RNA_snn_res.0.5	seurat_clusters	CellType	Size_Factor	num_genes_expressed
	<fct>	<dbl>	<int>	<dbl>	<fct>	<fct>	<fct>	<dbl>	<int>
AAACATACAACCAC-1	pbmc3k	2419	779	3.0177759	2	2	CD14+ Mono	1.1076350	779
AAACATTGAGCTAC-1	pbmc3k	4903	1352	3.7935958	3	3	B	2.2450329	1352
AAACATTGATCAGC-1	pbmc3k	3147	1129	0.8897363	2	2	CD14+ Mono	1.4409787	1129
AAACCGTGCTTCCG-1	pbmc3k	2639	960	1.7430845	1	1	Memory CD4 T	1.2083708	960
AAACCGTGTATGCG-1	pbmc3k	980	521	1.2244898	6	6	NK	0.4487318	521
AAACGCACTGGTAC-1	pbmc3k	2163	781	1.6643551	2	2	CD14+ Mono	0.9904153	781

	p_val	avg_log2FC	pct.1	pct.2	p_val_adj	cluster	gene
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<fct>	<chr>
RPS12	1.273332e-143	0.7378080	1.000	0.991	1.746248e-139	Naive CD4 T	RPS12
RPS6	6.817653e-143	0.6928027	1.000	0.995	9.349729e-139	Naive CD4 T	RPS6
RPS27	4.661810e-141	0.7363321	0.999	0.992	6.393206e-137	Naive CD4 T	RPS27
RPL32	8.158412e-138	0.6258942	0.999	0.995	1.118845e-133	Naive CD4 T	RPL32
RPS14	5.177478e-130	0.6328349	1.000	0.994	7.100394e-126	Naive CD4 T	RPS14
CYBA	8.340652e-128	-1.7659602	0.659	0.913	1.143837e-123	Naive CD4 T	CYBA

A data.frame: 6 × 7
	p_val	avg_log2FC	pct.1	pct.2	p_val_adj	cluster	gene
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<fct>	<chr>
LDHB	3.746131e-112	1.1926289	0.912	0.592	5.137444e-108	Naive CD4 T	LDHB
EEF1A1	7.956185e-98	0.5351221	0.994	0.991	1.091111e-93	Naive CD4 T	EEF1A1
MALAT1	1.266916e-91	0.6495751	1.000	0.999	1.737448e-87	Naive CD4 T	MALAT1
CCR7	9.571984e-88	2.2144646	0.447	0.108	1.312702e-83	Naive CD4 T	CCR7
TPT1	1.801906e-85	0.6099004	0.997	0.982	2.471133e-81	Naive CD4 T	TPT1
CD3D	1.154695e-76	1.0495450	0.845	0.406	1.583548e-72	Naive CD4 T	CD3D

A data.frame: 6 × 7
	status	family	pval	qval	gene_short_name	num_cells_expressed	use_for_ordering
	<chr>	<chr>	<dbl>	<dbl>	<chr>	<int>	<lgl>
S100A9	OK	negbinomial.size	4.717768e-288	8.751459e-285	S100A9	943	TRUE
S100A8	OK	negbinomial.size	4.938210e-271	4.580190e-268	S100A8	730	TRUE
LYZ	OK	negbinomial.size	4.503770e-211	2.784831e-208	LYZ	1595	TRUE
CST3	OK	negbinomial.size	1.144560e-204	5.307895e-202	CST3	1050	TRUE
LGALS2	OK	negbinomial.size	6.539955e-193	2.426323e-190	LGALS2	564	TRUE
FCN1	OK	negbinomial.size	1.544015e-174	4.773578e-172	FCN1	782	TRUE

A data.frame: 6 × 7
	status	family	pval	qval	gene_short_name	num_cells_expressed	use_for_ordering
	<chr>	<chr>	<dbl>	<dbl>	<chr>	<int>	<lgl>
GZMB	OK	negbinomial.size	3.593340e-164	4.857117e-160	GZMB	318	TRUE
PPBP	OK	negbinomial.size	3.188004e-157	2.154613e-153	PPBP	76	TRUE
GNLY	OK	negbinomial.size	9.031612e-129	4.069343e-125	GNLY	475	TRUE
NKG7	OK	negbinomial.size	3.622942e-109	1.224283e-105	NKG7	788	TRUE
PF4	OK	negbinomial.size	1.198009e-101	3.238697e-98	PF4	41	TRUE
FGFBP2	OK	negbinomial.size	4.806126e-94	1.082740e-90	FGFBP2	290	TRUE