Seurat入门

全局设定

我们将使用系统内安装的R:

# bash命令
export PATH="/sibcb1/bioinformatics/biocourse/biocourse01/miniforge3/envs/scRNA/bin:$PATH"

加载R包:

读入数据

pbmc.data <- Read10X(data.dir="pbmc3k/filtered_gene_bc_matrices/hg19/")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
An object of class Seurat 
13714 features across 2700 samples within 1 assay 
Active assay: RNA (13714 features, 0 variable features)
 1 layer present: counts
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

获取细胞水平的metadata

# 显示所有的列
head(pbmc[[]])
                 orig.ident nCount_RNA nFeature_RNA percent.mt
AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759
AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958
AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363
AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845
AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898
AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551
# 只显示特定的列
head(pbmc[[c('nFeature_RNA', 'percent.mt')]])
                 nFeature_RNA percent.mt
AAACATACAACCAC-1          779  3.0177759
AAACATTGAGCTAC-1         1352  3.7935958
AAACATTGATCAGC-1         1129  0.8897363
AAACCGTGCTTCCG-1          960  1.7430845
AAACCGTGTATGCG-1          521  1.2244898
AAACGCACTGGTAC-1          781  1.6643551
# 我们也能很方便的添加metadata
pbmc[['tmp']] = 1
head(pbmc[[]])
                 orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759   1
AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958   1
AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363   1
AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845   1
AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898   1
AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551   1

UMI counts

# layers
Layers(pbmc)
[1] "counts"
# Assay
Assays(pbmc)
[1] "RNA"
# 显示UMI counts
pbmc[['RNA']]$counts[1:10, 1:5]
10 x 5 sparse Matrix of class "dgCMatrix"
              AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
AL627309.1                   .                .                .
AP006222.2                   .                .                .
RP11-206L10.2                .                .                .
RP11-206L10.9                .                .                .
LINC00115                    .                .                .
NOC2L                        .                .                .
KLHL17                       .                .                .
PLEKHN1                      .                .                .
RP11-54O7.17                 .                .                .
HES4                         .                .                .
              AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
AL627309.1                   .                .
AP006222.2                   .                .
RP11-206L10.2                .                .
RP11-206L10.9                .                .
LINC00115                    .                .
NOC2L                        .                .
KLHL17                       .                .
PLEKHN1                      .                .
RP11-54O7.17                 .                .
HES4                         .                .

观察两个变量之间的关系

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

过滤细胞和基因

# 过滤细胞,过滤掉的细胞和基因不参加任何后续分析
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc
An object of class Seurat 
13714 features across 2638 samples within 1 assay 
Active assay: RNA (13714 features, 0 variable features)
 1 layer present: counts

均一化

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

# 保存一个单独的layer内
Layers(pbmc)
[1] "counts" "data"  
pbmc[['RNA']]$data[1:10, 1:5]
10 x 5 sparse Matrix of class "dgCMatrix"
              AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
AL627309.1                   .                .                .
AP006222.2                   .                .                .
RP11-206L10.2                .                .                .
RP11-206L10.9                .                .                .
LINC00115                    .                .                .
NOC2L                        .                .                .
KLHL17                       .                .                .
PLEKHN1                      .                .                .
RP11-54O7.17                 .                .                .
HES4                         .                .                .
              AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
AL627309.1                   .                .
AP006222.2                   .                .
RP11-206L10.2                .                .
RP11-206L10.9                .                .
LINC00115                    .                .
NOC2L                        .                .
KLHL17                       .                .
PLEKHN1                      .                .
RP11-54O7.17                 .                .
HES4                         .                .

鉴定高可变基因

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE)

Scaling

由于后续PCA分析的需要,我们需要对数据进行scaling,从而使得每个基因的均值为0,方差为1。默认情况下,只对高可变基因进行scaling。

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc # 目前有三个layers
An object of class Seurat 
13714 features across 2638 samples within 1 assay 
Active assay: RNA (13714 features, 2000 variable features)
 3 layers present: counts, data, scale.data

降维

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
ElbowPlot(pbmc, ndims = 40, reduction = "pca")

聚类

pbmc <- FindNeighbors(pbmc, dims = 1:10) # Based on shared nearest neighbor graph (SNN)
pbmc <- FindClusters(pbmc, resolution = 0.5) # 改变参数可以得到不同的cluster数目
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 95927

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8728
Number of communities: 9
Elapsed time: 0 seconds
head(pbmc[[]]) # 查看cluster信息
                 orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759   1
AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958   1
AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363   1
AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845   1
AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898   1
AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551   1
                 RNA_snn_res.0.5 seurat_clusters
AAACATACAACCAC-1               2               2
AAACATTGAGCTAC-1               3               3
AAACATTGATCAGC-1               2               2
AAACCGTGCTTCCG-1               1               1
AAACCGTGTATGCG-1               6               6
AAACGCACTGGTAC-1               2               2

可视化

pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

注释

基于cell marker的细胞注释

# 查看marker genes
FeaturePlot(pbmc, features = c('IL7R', 'CCR7', 'CD14', 'LYZ', 'S100A4', 'MS4A1', 'CD8A', 'FCGR3A', 'GNLY', 'FCER1A', "CST3", "PPBP"))

不同的免疫细胞有不同的marker genes,我们可以用这些marker genes来对不同细胞类型进行分类:

marker cellTyp
IL7R, CCR7 Naive T cells
CD14, LYZ CD14+ Monocytes
IL7R, S100A4 Memory CD4+ T cells
MS4A1 B
CD8A CD8+ T
FCGR3A, MS4A7 FCGR3A+ Monocytes
GNLY, NKG7 NK
FCER1A, CST3 DC
PPBP Platelet

根据上述信息,我们可以对不同细胞类型进行分类:

new.cluster.ids = c('Naive T cells', 'CD14+ Monocytes', 'Memory CD4+ T cells', 'B', 'CD8+ T', 'FCR3A+ Monocytes', 'NK', 'DC', 'Platelet')
names(new.cluster.ids) = levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5, label.size = 3) + NoLegend()

也可以把上述信息更新到metadata中:

pbmc[["cellType"]] <- Idents(object = pbmc)
head(pbmc[[]])
                 orig.ident nCount_RNA nFeature_RNA percent.mt tmp
AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759   1
AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958   1
AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363   1
AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845   1
AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898   1
AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551   1
                 RNA_snn_res.0.5 seurat_clusters            cellType
AAACATACAACCAC-1               2               2 Memory CD4+ T cells
AAACATTGAGCTAC-1               3               3                   B
AAACATTGATCAGC-1               2               2 Memory CD4+ T cells
AAACCGTGCTTCCG-1               1               1     CD14+ Monocytes
AAACCGTGTATGCG-1               6               6                  NK
AAACGCACTGGTAC-1               2               2 Memory CD4+ T cells

差异表达分析

pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, test.use = "wilcox")
head(pbmc.markers, 10)
              p_val avg_log2FC pct.1 pct.2     p_val_adj
RPS12 1.273332e-143  0.7387061 1.000 0.991 1.746248e-139
RPS6  6.817653e-143  0.6934523 1.000 0.995 9.349729e-139
RPS27 4.661810e-141  0.7372604 0.999 0.992 6.393206e-137
RPL32 8.158412e-138  0.6266075 0.999 0.995 1.118845e-133
RPS14 5.177478e-130  0.6336957 1.000 0.994 7.100394e-126
RPS25 3.244898e-123  0.7689940 0.997 0.975 4.450053e-119
RPL31 2.073444e-119  0.7757544 0.997 0.964 2.843521e-115
RPL9  1.510863e-118  0.7714338 0.997 0.971 2.071998e-114
RPL13 1.216051e-114  0.5659877 1.000 0.996 1.667693e-110
LDHB  3.746131e-112  1.2060189 0.912 0.592 5.137444e-108
            cluster  gene
RPS12 Naive T cells RPS12
RPS6  Naive T cells  RPS6
RPS27 Naive T cells RPS27
RPL32 Naive T cells RPL32
RPS14 Naive T cells RPS14
RPS25 Naive T cells RPS25
RPL31 Naive T cells RPL31
RPL9  Naive T cells  RPL9
RPL13 Naive T cells RPL13
LDHB  Naive T cells  LDHB
pbmc.markers %>%
    group_by(cluster) %>%
    dplyr::filter(avg_log2FC > 1) %>%
    slice_head(n = 10) %>%
    ungroup() -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

数据保存

saveRDS(pbmc, "pbmc.RDS")