我们将使用系统内安装的R:
# bash命令
export PATH="/sibcb1/bioinformatics/biocourse/biocourse01/miniforge3/envs/scRNA/bin:$PATH"
加载R包:
pbmc.data <- Read10X(data.dir="pbmc3k/filtered_gene_bc_matrices/hg19/")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# 显示所有的列
head(pbmc[[]])
orig.ident nCount_RNA nFeature_RNA percent.mt
AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759
AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958
AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363
AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845
AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898
AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551
nFeature_RNA percent.mt
AAACATACAACCAC-1 779 3.0177759
AAACATTGAGCTAC-1 1352 3.7935958
AAACATTGATCAGC-1 1129 0.8897363
AAACCGTGCTTCCG-1 960 1.7430845
AAACCGTGTATGCG-1 521 1.2244898
AAACGCACTGGTAC-1 781 1.6643551
# 我们也能很方便的添加metadata
pbmc[['tmp']] = 1
head(pbmc[[]])
orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 1
AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 1
AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 1
AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 1
AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 1
AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 1
# layers
Layers(pbmc)
[1] "counts"
# Assay
Assays(pbmc)
[1] "RNA"
# 显示UMI counts
pbmc[['RNA']]$counts[1:10, 1:5]
10 x 5 sparse Matrix of class "dgCMatrix"
AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
AL627309.1 . . .
AP006222.2 . . .
RP11-206L10.2 . . .
RP11-206L10.9 . . .
LINC00115 . . .
NOC2L . . .
KLHL17 . . .
PLEKHN1 . . .
RP11-54O7.17 . . .
HES4 . . .
AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
AL627309.1 . .
AP006222.2 . .
RP11-206L10.2 . .
RP11-206L10.9 . .
LINC00115 . .
NOC2L . .
KLHL17 . .
PLEKHN1 . .
RP11-54O7.17 . .
HES4 . .
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# 过滤细胞,过滤掉的细胞和基因不参加任何后续分析
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
# 保存一个单独的layer内
Layers(pbmc)
[1] "counts" "data"
pbmc[['RNA']]$data[1:10, 1:5]
10 x 5 sparse Matrix of class "dgCMatrix"
AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
AL627309.1 . . .
AP006222.2 . . .
RP11-206L10.2 . . .
RP11-206L10.9 . . .
LINC00115 . . .
NOC2L . . .
KLHL17 . . .
PLEKHN1 . . .
RP11-54O7.17 . . .
HES4 . . .
AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
AL627309.1 . .
AP006222.2 . .
RP11-206L10.2 . .
RP11-206L10.9 . .
LINC00115 . .
NOC2L . .
KLHL17 . .
PLEKHN1 . .
RP11-54O7.17 . .
HES4 . .
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE)
由于后续PCA分析的需要,我们需要对数据进行scaling,从而使得每个基因的均值为0,方差为1。默认情况下,只对高可变基因进行scaling。
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
ElbowPlot(pbmc, ndims = 40, reduction = "pca")
pbmc <- FindNeighbors(pbmc, dims = 1:10) # Based on shared nearest neighbor graph (SNN)
pbmc <- FindClusters(pbmc, resolution = 0.5) # 改变参数可以得到不同的cluster数目
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 95927
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8728
Number of communities: 9
Elapsed time: 0 seconds
head(pbmc[[]]) # 查看cluster信息
orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 1
AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 1
AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 1
AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 1
AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 1
AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 1
RNA_snn_res.0.5 seurat_clusters
AAACATACAACCAC-1 2 2
AAACATTGAGCTAC-1 3 3
AAACATTGATCAGC-1 2 2
AAACCGTGCTTCCG-1 1 1
AAACCGTGTATGCG-1 6 6
AAACGCACTGGTAC-1 2 2
# 查看marker genes
FeaturePlot(pbmc, features = c('IL7R', 'CCR7', 'CD14', 'LYZ', 'S100A4', 'MS4A1', 'CD8A', 'FCGR3A', 'GNLY', 'FCER1A', "CST3", "PPBP"))
不同的免疫细胞有不同的marker genes,我们可以用这些marker genes来对不同细胞类型进行分类:
marker | cellTyp |
---|---|
IL7R, CCR7 | Naive T cells |
CD14, LYZ | CD14+ Monocytes |
IL7R, S100A4 | Memory CD4+ T cells |
MS4A1 | B |
CD8A | CD8+ T |
FCGR3A, MS4A7 | FCGR3A+ Monocytes |
GNLY, NKG7 | NK |
FCER1A, CST3 | DC |
PPBP | Platelet |
根据上述信息,我们可以对不同细胞类型进行分类:
new.cluster.ids = c('Naive T cells', 'CD14+ Monocytes', 'Memory CD4+ T cells', 'B', 'CD8+ T', 'FCR3A+ Monocytes', 'NK', 'DC', 'Platelet')
names(new.cluster.ids) = levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5, label.size = 3) + NoLegend()
也可以把上述信息更新到metadata中:
orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 1
AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 1
AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 1
AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 1
AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 1
AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 1
RNA_snn_res.0.5 seurat_clusters cellType
AAACATACAACCAC-1 2 2 Memory CD4+ T cells
AAACATTGAGCTAC-1 3 3 B
AAACATTGATCAGC-1 2 2 Memory CD4+ T cells
AAACCGTGCTTCCG-1 1 1 CD14+ Monocytes
AAACCGTGTATGCG-1 6 6 NK
AAACGCACTGGTAC-1 2 2 Memory CD4+ T cells
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, test.use = "wilcox")
head(pbmc.markers, 10)
p_val avg_log2FC pct.1 pct.2 p_val_adj
RPS12 1.273332e-143 0.7387061 1.000 0.991 1.746248e-139
RPS6 6.817653e-143 0.6934523 1.000 0.995 9.349729e-139
RPS27 4.661810e-141 0.7372604 0.999 0.992 6.393206e-137
RPL32 8.158412e-138 0.6266075 0.999 0.995 1.118845e-133
RPS14 5.177478e-130 0.6336957 1.000 0.994 7.100394e-126
RPS25 3.244898e-123 0.7689940 0.997 0.975 4.450053e-119
RPL31 2.073444e-119 0.7757544 0.997 0.964 2.843521e-115
RPL9 1.510863e-118 0.7714338 0.997 0.971 2.071998e-114
RPL13 1.216051e-114 0.5659877 1.000 0.996 1.667693e-110
LDHB 3.746131e-112 1.2060189 0.912 0.592 5.137444e-108
cluster gene
RPS12 Naive T cells RPS12
RPS6 Naive T cells RPS6
RPS27 Naive T cells RPS27
RPL32 Naive T cells RPL32
RPS14 Naive T cells RPS14
RPS25 Naive T cells RPS25
RPL31 Naive T cells RPL31
RPL9 Naive T cells RPL9
RPL13 Naive T cells RPL13
LDHB Naive T cells LDHB
saveRDS(pbmc, "pbmc.RDS")