Bioinformatics training course: scRNA-seq: 使用Seurat进行多样本分析

全局设定

加载R包：

library(ggplot2)
library(data.table)
library("dplyr")
library(Seurat)
library(harmony)
library(reticulate)
library(patchwork)

读入数据

readfile2seurat <- function(file_path, samplename) {
    mtx <- fread(file_path)
    rownames(mtx) <- mtx$V1
    mtx$V1 <- NULL
    obj <- CreateSeuratObject(counts = mtx, min.cell = 0)
    obj$orig.ident <- samplename
    return(obj)
}

gzFiles = list.files("GSE129516_RAW_mouse/")
gzFiles

[1] "GSM3714747_chow1_filtered_gene_bc_matrices.csv.gz"
[2] "GSM3714748_chow2_filtered_gene_bc_matrices.csv.gz"
[3] "GSM3714749_chow3_filtered_gene_bc_matrices.csv.gz"
[4] "GSM3714750_nash1_filtered_gene_bc_matrices.csv.gz"
[5] "GSM3714751_nash2_filtered_gene_bc_matrices.csv.gz"
[6] "GSM3714752_nash3_filtered_gene_bc_matrices.csv.gz"

for(i in 1:6){
  input = paste0('GSE129516_RAW_mouse/', gzFiles[i])
  samplename = strsplit(gzFiles[i], '_')[[1]][2]
  cat('Loading sample', samplename, '\n')
  assign(samplename, readfile2seurat(input, samplename))
}

Loading sample chow1 
Loading sample chow2 
Loading sample chow3 
Loading sample nash1 
Loading sample nash2 
Loading sample nash3

mouse_all <- merge(x = chow1, y = c(chow2, chow3, nash1, nash2, nash3), project = 'GSE129516_mouse')
mouse_all

An object of class Seurat 
19349 features across 33168 samples within 1 assay 
Active assay: RNA (19349 features, 0 variable features)
 6 layers present: counts.1, counts.2, counts.3, counts.4, counts.5, counts.6

Seurat

流程性处理

mouse_all[[]] %>% head

                   orig.ident nCount_RNA nFeature_RNA
AAACCTGAGAGTAATC_1      chow1       2337         1228
AAACCTGGTAAATGTG_1      chow1       1306          777
AAACGGGAGAAACGAG_1      chow1       3786         1784
AAACGGGAGGAGTACC_1      chow1       1726         1046
AAACGGGCACCACCAG_1      chow1       1345          826
AAACGGGTCAAGGCTT_1      chow1       1785         1008

mouse_all[[]] = mouse_all[[]] %>% mutate(sample = gsub('[1-3]', '', orig.ident))

mouse_all <- mouse_all %>% NormalizeData %>% FindVariableFeatures(nfeatures = 1000) %>% 
    ScaleData(features = rownames(mouse_all)) %>% RunPCA %>% FindNeighbors(dims = 1:25, verbose = F) %>% 
    FindClusters(resolution = 0.1, verbose = F) %>% RunTSNE(dims = 1:25, seed.use=2)

mouse_all[[]] %>% head

                   orig.ident nCount_RNA nFeature_RNA sample
AAACCTGAGAGTAATC_1      chow1       2337         1228   chow
AAACCTGGTAAATGTG_1      chow1       1306          777   chow
AAACGGGAGAAACGAG_1      chow1       3786         1784   chow
AAACGGGAGGAGTACC_1      chow1       1726         1046   chow
AAACGGGCACCACCAG_1      chow1       1345          826   chow
AAACGGGTCAAGGCTT_1      chow1       1785         1008   chow
                   RNA_snn_res.0.1 seurat_clusters
AAACCTGAGAGTAATC_1               0               0
AAACCTGGTAAATGTG_1               0               0
AAACGGGAGAAACGAG_1               0               0
AAACGGGAGGAGTACC_1               0               0
AAACGGGCACCACCAG_1               0               0
AAACGGGTCAAGGCTT_1               0               0

展示聚类结果

col_palatte <- c('#6965CE', '#EE5755', '#75E15A', '#D5CE85', '#9F33D9', '#9BE2CF', '#69A5C8', 
    '#FC9500', '#D846AF', '#CE95C4', '#0018F9')
general_marker <- c('Stab2', 'Cd3g', 'Csf1r', 'Ebf1', 'Irf8', 'Sox9', 'Apoc3', 'Top2a', 'Jchain', 'Dcn')

# 展示合并以后得分类结果
DimPlot(mouse_all, label = T, pt.size = 0.0001, cols = col_palatte)

# 也可分开展示两种状态
DimPlot(mouse_all, label = T, split.by= 'sample', pt.size = 0.0001, cols = col_palatte)

# 按照样本展示
options(repr.plot.width=15, repr.plot.height=5)
DimPlot(mouse_all, split.by= 'orig.ident', pt.size = 0.0001, cols = col_palatte)

细胞注释

FeaturePlot(mouse_all, features = general_marker, ncol=4, cols=c('grey95', 'red3'), label=T, pt.size=0.001)

根据marker细胞的表达进行分类：

new.cluster.ids <- c('Endo', 'T cell', 'Macrophage', 'B-cell', 'DC', 'Macrophage', 
    'Cholangiocyte', 'Hepatocyte', 'Dividing cell', 'Plasma cell', 'HSC')
names(new.cluster.ids) = levels(mouse_all)
mouse_all <- RenameIdents(mouse_all, new.cluster.ids)
mouse_all[["cellType"]] <- Idents(object = mouse_all)
head(mouse_all[[]])

                   orig.ident nCount_RNA nFeature_RNA sample
AAACCTGAGAGTAATC_1      chow1       2337         1228   chow
AAACCTGGTAAATGTG_1      chow1       1306          777   chow
AAACGGGAGAAACGAG_1      chow1       3786         1784   chow
AAACGGGAGGAGTACC_1      chow1       1726         1046   chow
AAACGGGCACCACCAG_1      chow1       1345          826   chow
AAACGGGTCAAGGCTT_1      chow1       1785         1008   chow
                   RNA_snn_res.0.1 seurat_clusters cellType
AAACCTGAGAGTAATC_1               0               0     Endo
AAACCTGGTAAATGTG_1               0               0     Endo
AAACGGGAGAAACGAG_1               0               0     Endo
AAACGGGAGGAGTACC_1               0               0     Endo
AAACGGGCACCACCAG_1               0               0     Endo
AAACGGGTCAAGGCTT_1               0               0     Endo

# 检查marker
VlnPlot(mouse_all, features = rev(general_marker), cols = rev(col_palatte), group.by = 'cellType', stack = T) + 
    theme(legend.position = 'None') + labs(title = NULL)

# 展示分类后的结果
DimPlot(mouse_all, label = F, split.by= 'sample', pt.size = 0.0001, cols = col_palatte)

批次校正

harmony能够对PCA分析的PC embeddings进行后续处理，从而整合不同批次的数据。在Seurat中能够方便的使用harmony。

mouse_harmony <- RunHarmony(mouse_all, "orig.ident") # 指定批次
mouse_harmony <- mouse_harmony %>% RunTSNE(dims = 1:25, reduction='harmony', seed.use=2)

# 展示校正后的结果
DimPlot(mouse_harmony, label = T, pt.size = 0.0001, cols = col_palatte)

# 显示所有样本
DimPlot(mouse_harmony, group.by = 'cellType', split.by= 'orig.ident', pt.size = 0.0001, cols = col_palatte)

FeaturePlot(mouse_harmony, features = general_marker, ncol=4, cols=c('grey95', 'red3'), label=F, pt.size=0.001)

saveRDS(mouse_harmony, "output/mouse_harmony.RDS")