Seurat學習記錄

2021-02-23 生信微課
前言
最近雖然在調研單細胞，但是關於單細胞的數據尚未處理過。所以先根據網上的例子跑了一遍流程。於是開始動手，複製忍者卡卡西---寫輪眼技能發動。具體內容摘自Seurat官網，中文注釋來自博客（https://qiubio.com/）以及自己的註解，僅做記錄，並非完全原創。
目前最流行的R包是Seurat，基本能滿足研究者的需求，於是便以這個包來進行練手。具體例子來源於官網 https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html
Seurat 分析流程
library(Seurat)
packageVersion("Seurat")
library(dplyr)
1.導入10X GENOMICS數據library(ggplot2)
# 讀取Peripheral Blood Mononuclear Cells (PBMC)數據

pbmc.data <- Read10X(data.dir = "data/hg19")
# 初始化一個Seurat對象。
# 在初始化的時候，使用每個細胞表達的基因數不小於200，
# 計數基因表達在不少於3個細胞中做為初篩。
pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3, 
                           min.features = 200, project = "10X_PBMC")
pbmc
UMI計數表格對於拿到UMI計數表格的情況，構建一個dgTMatrix
# library(Seurat)
# library(Matrix)
# nUMI.summary.file <- "data/umi_expression_matrix.tsv"
# counts <- list()
# con <- file(nUMI.summary.file, open="r")
# ## 讀取文件頭
# header <- readLines(con, n=1, warn=FALSE)
# header <- strsplit(header, "\\t")[[1]]
# ## 讀取計數
# i <- 1
# while(length(buf <- readLines(con, n=1e6, warn=FALSE))>0){
#   buf <- as.matrix(read.delim(text=buf, header=FALSE, row.names=1))
#   counts[[i]] <- Matrix(buf, sparse=TRUE)
#   i <- i + 1
# }
# counts <- do.call(rbind, counts)
# rownames(counts)[1:5]
# colnames(counts) <- header
# ## 初始化一個Seurat對象
# d <- CreateSeuratObject(counts = counts, min.cells = 10,
#                         min.features = 200, project = "test_project")
2. Quality Control載入注釋library(rtracklayer)
gtf <- import("data/Homo_sapiens.GRCh38.102.chr.gtf.gz")
gtf <- gtf[gtf$gene_name!=""]
gtf <- gtf[!is.na(gtf$gene_name)]
## protein coding
protein <- 
  gtf$gene_name[gtf$transcript_biotype %in% 
                  c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene", 
                    "IG_M_gene", "IG_V_gene", "IG_Z_gene", 
                    "nonsense_mediated_decay", "nontranslating_CDS", 
                    "non_stop_decay", "polymorphic_pseudogene", 
                    "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", 
                    "TR_J_gene", "TR_V_gene")]
## mitochondrial genes
mito <- gtf$gene_name[as.character(seqnames(gtf)) %in% "MT"]
## long noncoding
lincRNA <- 
  gtf$gene_name[gtf$transcript_biotype %in% 
                  c("3prime_overlapping_ncrna", "ambiguous_orf", 
                    "antisense_RNA", "lincRNA", "ncrna_host", "non_coding", 
                    "processed_transcript", "retained_intron", 
                    "sense_intronic", "sense_overlapping")]
## short noncoding
sncRNA <- 
  gtf$gene_name[gtf$transcript_biotype %in% 
                  c("miRNA", "miRNA_pseudogene", "misc_RNA", 
                    "misc_RNA_pseudogene", "Mt_rRNA", "Mt_tRNA", 
                    "Mt_tRNA_pseudogene", "ncRNA", "pre_miRNA", 
                    "RNase_MRP_RNA", "RNase_P_RNA", "rRNA", "rRNA_pseudogene", 
                    "scRNA_pseudogene", "snlRNA", "snoRNA", 
                    "snRNA_pseudogene", "SRP_RNA", "tmRNA", "tRNA",
                    "tRNA_pseudogene", "ribozyme", "scaRNA", "sRNA")]
## pseudogene
pseudogene <- 
  gtf$gene_name[gtf$transcript_biotype %in% 
                  c("disrupted_domain", "IG_C_pseudogene", "IG_J_pseudogene", 
                    "IG_pseudogene", "IG_V_pseudogene", "processed_pseudogene", 
                    "pseudogene", "transcribed_processed_pseudogene",
                    "transcribed_unprocessed_pseudogene", 
                    "translated_processed_pseudogene", 
                    "translated_unprocessed_pseudogene", "TR_J_pseudogene", 
                    "TR_V_pseudogene", "unitary_pseudogene", 
                    "unprocessed_pseudogene")]
annotations <- list(protein=unique(protein), 
                    mito=unique(mito),
                    lincRNA=unique(lincRNA),
                    sncRNA=unique(sncRNA),
                    pseudogene=unique(pseudogene))
annotations <- annotations[lengths(annotations)>0]
質量控制與細胞選取# Seurat會計算基因數以及UMI數 (nFeature and nCount).
# Seurat會將原始數據保存在RNA slot中，
# 每一行對應一個基因，每一列對應一個細胞.
slotNames(pbmc)
Assays(pbmc)  ## == names(pbmc@assays)
DefaultAssay(pbmc) ## 查看當前默認的assay。
head(Idents(pbmc)) ## 查看一下當前的細胞分組。
# 在計算比例時，使用目標基因中的數值除以總的數值。
# `[[`運算符可以給metadata加column.
percent <- lapply(annotations, function(.ele){
  PercentageFeatureSet(pbmc, features = rownames(pbmc)[rownames(pbmc) %in% .ele])
})

# AddMetaData 會在object@meta.data中加一列。這些信息都會在QC中使用到。
for(i in seq_along(percent)){
  pbmc[[paste0("percent.", names(percent)[i])]] <- percent[[i]]
}


# AddMetaData也可以直接通過改變pbmc@meta.data來進行.

id <- sample(c("Group_A", "Group_B","Group_C"), nrow(pbmc@meta.data), replace = TRUE)
names(id) <- rownames(pbmc@meta.data) #必須有names
pbmc@meta.data$group <- id

head(pbmc@meta.data)
查看不同種類RNA percent
VlnPlot(object = pbmc, 
        features = c("nFeature_RNA", "nCount_RNA",
                      paste0("percent.", names(percent))), 
        ncol =# FeatureScatter是一個用來畫點圖的工具，可以比較不同的metadata。
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# 從VlnPlot中，我們可以得到大多數細胞集中的範圍，通過這些信息，可以設置過濾條件。
# 比如可以使用unique gene counts, percent.protein, percentage.mito來進行過濾。
# 這裡我們設置了200 < nGene < 2500, 0.90 < percent.protein < Inf, -Inf < percent.mito < 0.05.
dim(pbmc)
過濾cell和featurepbmc <- subset(x = pbmc, 
               subset = nFeature_RNA > 200 & nFeature_RNA < 2500 &
                 percent.protein > 90 &
                 percent.mito < 5) 
dim(pbmc)
3. Normalization# 使用的是"logNormalize", 就是將每個基因的表達量對該細胞總表達量進行平衡，然後乘以一個因子(scale factor, 默認值為10,000), 然後中進行對數轉換。
pbmc <- NormalizeData(pbmc)
Variable genes
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", 
                             nfeatures = 2000)
# 找到變化最大的十個基因
top10 <- head(VariableFeatures(pbmc), 10)
top10

# 畫圖查看
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE)
從上圖中，我們可以看到很多基因都表現出很大的差異性，但是這些差異性並不一定都是事實，有些可能是因為實驗，或者數據採集的方式導入的。因此人們想儘量的消除這些因素帶來的差異。這些差異有可能是因為batch-effect， 或者cell alignment rate，或者其它。這些因素都可以寫入metadata中，然後通過ScaleData函數來消除
all.genes <- rownames(pbmc)
pbmc <- ScaleData(object = pbmc, features = all.genes) # 校正batch等
head(pbmc[["RNA"]]@scale.data[, 1:3])
4. PCA analysispbmc <- RunPCA(object = pbmc, features = VariableFeatures(pbmc))
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(object = pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:12, cells = 500, balanced = TRUE)
# 使用jackStraw方法來估計應該使用多少個principal components的基因來進行下遊分析
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
ElbowPlot(pbmc) # 得到cut.off值設置在7-10之間
# # Split seurat object by condition to perform cell cycle scoring and SCT on all samples
# split_seurat <- SplitObject(pbmc, split.by = "group")
# 
# split_seurat <- split_seurat[c("Group_A", "Group_B","Group_C")]
# 
# for (i in 1:length(split_seurat)) {
#     split_seurat[[i]] <- NormalizeData(split_seurat[[i]], verbose = TRUE)
#     # split_seurat[[i]] <- CellCycleScoring(split_seurat[[i]], g2m.features=g2m_genes, s.features=s_genes)
#     split_seurat[[i]] <- SCTransform(split_seurat[[i]])
#     }
# # Select the most variable features to use for integration
# integ_features <- SelectIntegrationFeatures(object.list = split_seurat,
#                                             nfeatures = 3000)
# # Prepare the SCT list object for integration
# split_seurat <- PrepSCTIntegration(object.list = split_seurat,
#                                    anchor.features = integ_features)
# # Find best buddies - can take a while to run
# integ_anchors <- FindIntegrationAnchors(object.list = split_seurat,
#                                         normalization.method = "SCT",
#                                         anchor.features = integ_features)
# # Integrate across conditions
# seurat_integrated <- IntegrateData(anchorset = integ_anchors,
#                                    normalization.method = "SCT")
# Run PCA
# seurat_integrated <- RunPCA(object = seurat_integrated)
# 
# # Plot PCA
# PCAPlot(seurat_integrated,
#         split.by = "group")
# 
# # Run UMAP
# seurat_integrated <- RunUMAP(seurat_integrated,
#                              dims = 1:40,
#                  reduction = "pca")
# 
# # Plot UMAP
# DimPlot(seurat_integrated,group.by = 'group')
# DimPlot(seurat_integrated,group.by = 'group')
5. Cluster the cellspbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.6) # 解析度為0.6
# 查看一下前幾個的cluster
head(Idents(pbmc), 5)
生成降維圖像(UMAP/tSNE)
pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap", label = TRUE)
6. Finding differentially expressed genes (cluster biomarkers)生成了tSNE圖像後，我們還需要確認這一堆堆的細胞都分別是些什麼細胞，這時，可以使用特定細胞的marker來對不同的細胞群進行標記。但是想要對它們進行標記，首先需要得到每個集群差異表達的基因。
在比較時，Seurat使用一比多的辦法。如果需要對cluster 1的細胞(ident.1)進行差異分析時，它比較的對象將是除了cluster 1以外的其它所有的細胞。也可以使用ident.2參數來傳遞需要比較的對象。在FindMarkers函數中，min.pct是指的marker基因所佔細胞數的最低比例，這裡使用1/4。使用max.cells.per.ident可以讓函數對細胞進行抽樣，以加速計算。FindAllMarkers可以一次性找到所有的高表達的marker
# find all markers of cluster 1
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, 
                                ident.2 = c(0,3), min.pct = 0.25)
head(cluster5.markers, n = 5)
# find markers for every cluster compared to all remaining cells, 
# report only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, 
                               min.pct = 0.25, thresh.use = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) ## top2 marker
還可以使用不同的算法來尋找差異表達的基因。only.pos 參數，可以指定返回positive markers基因。test.use可以指定檢驗方法，可選擇的有：wilcox，bimod，roc，t，negbinom，poisson，LR，MAST，DESeq2
# cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 0, 
#                                 thresh.use = 0.25, test.use = "roc", 
#                                 only.pos = TRUE)
Finds markers that are conserved between the groups
# #構建一個分組方式：
# pbmc[['groups']] <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE)
#  head(FindConservedMarkers(pbmc, ident.1 = 0, ident.2 = 1, grouping.var = "groups"))
使用VlnPlot來查看結果
VlnPlot(object = pbmc, features = c("S100A8", "S100A9"),log = TRUE)
VlnPlot(object = pbmc, features = c("MS4A1", "CD79A"))
# you can plot raw UMI counts as well
VlnPlot(object = pbmc, features= c("NKG7", "PF4"), 
        slot = "counts", log = TRUE)
tSNE可以查看不同基因在不同細胞中的表達情況
plot1 = FeaturePlot(object = pbmc, 
            features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", 
                         "FCGR3A", "LYZ", "PPBP", "CD8A"), 
            cols = c("blue", "red"))
plot(plot1)
用heatmap查看
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
# 設置slim.col.label=TRUE將避免列印每個細胞的名稱，而只列印cluster的ID.
plot2 = DoHeatmap(object = pbmc, features = top10$gene) + NoLegend()
plot(plot2)
##7. 對分集細胞進行標記
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", 
                     "CD14+ Mono", "B", "CD8 T", 
                     "FCGR3A+ Mono", 
                     "NK", "DC", "Platelet") ### 自定義
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
使用不同的解析度再算一次
pbmc[["ClusterNames_0.6"]] <- Idents(object = pbmc) #生成快照
# 現在我們使用不同的resolution來重新分群。當我們提高resolution的時候，
# 我們可以得到更多的群。
# 之前我們在FindClusters中設置了save.snn=TRUE，所以可以不用再計算SNN了。
# 下面就直接提高一下區分度，設置resolution = 0.8
pbmc <- FindClusters(pbmc, resolution = 0.8)
# 並排畫兩個tSNE,右邊的是用ClusterNames_0.6 (resolution=0.6) 來分組顏色的。
# 我們可以看到當resolution提高之後，CD4 T細胞被分成了兩個亞群。
plot1 <- DimPlot(object = pbmc, reduction = "umap", label = TRUE) + NoLegend()
plot2 <- DimPlot(object = pbmc, reduction = "umap", label = TRUE,
                 group.by = "ClusterNames_0.6") +
  NoLegend()
plot1 + plot2
# 我們再一次尋找不同分集中的markers。
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)

# Most of the markers tend to be expressed in C1 (i.e. S100A4). 
# However, we can see that CCR7 is upregulated in 
# C0, strongly indicating that we can differentiate memory from naive CD4 cells.
# cols.use demarcates the color palette from low to high expression

FeaturePlot(object = pbmc, features = c("S100A4", "CCR7"), 
            cols = c("gray", "blue"))