最近雖然在調研單細胞,但是關於單細胞的數據尚未處理過。所以先根據網上的例子跑了一遍流程。於是開始動手,複製忍者卡卡西---寫輪眼技能發動。具體內容摘自Seurat官網,中文注釋來自博客(https://qiubio.com/)以及自己的註解,僅做記錄,並非完全原創。
目前最流行的R包是Seurat,基本能滿足研究者的需求,於是便以這個包來進行練手。具體例子來源於官網 https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html
library(Seurat)
packageVersion("Seurat")
library(dplyr)
1.導入10X GENOMICS數據library(ggplot2)
# 讀取Peripheral Blood Mononuclear Cells (PBMC)數據
pbmc.data <- Read10X(data.dir = "data/hg19")
# 初始化一個Seurat對象。
# 在初始化的時候,使用每個細胞表達的基因數不小於200,
# 計數基因表達在不少於3個細胞中做為初篩。
pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3,
min.features = 200, project = "10X_PBMC")
pbmc
UMI計數表格對於拿到UMI計數表格的情況,構建一個dgTMatrix
# library(Seurat)
# library(Matrix)
# nUMI.summary.file <- "data/umi_expression_matrix.tsv"
# counts <- list()
# con <- file(nUMI.summary.file, open="r")
# ## 讀取文件頭
# header <- readLines(con, n=1, warn=FALSE)
# header <- strsplit(header, "\\t")[[1]]
# ## 讀取計數
# i <- 1
# while(length(buf <- readLines(con, n=1e6, warn=FALSE))>0){
# buf <- as.matrix(read.delim(text=buf, header=FALSE, row.names=1))
# counts[[i]] <- Matrix(buf, sparse=TRUE)
# i <- i + 1
# }
# counts <- do.call(rbind, counts)
# rownames(counts)[1:5]
# colnames(counts) <- header
# ## 初始化一個Seurat對象
# d <- CreateSeuratObject(counts = counts, min.cells = 10,
# min.features = 200, project = "test_project")
2. Quality Control載入注釋library(rtracklayer)
gtf <- import("data/Homo_sapiens.GRCh38.102.chr.gtf.gz")
gtf <- gtf[gtf$gene_name!=""]
gtf <- gtf[!is.na(gtf$gene_name)]
## protein coding
protein <-
gtf$gene_name[gtf$transcript_biotype %in%
c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene",
"IG_M_gene", "IG_V_gene", "IG_Z_gene",
"nonsense_mediated_decay", "nontranslating_CDS",
"non_stop_decay", "polymorphic_pseudogene",
"protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene",
"TR_J_gene", "TR_V_gene")]
## mitochondrial genes
mito <- gtf$gene_name[as.character(seqnames(gtf)) %in% "MT"]
## long noncoding
lincRNA <-
gtf$gene_name[gtf$transcript_biotype %in%
c("3prime_overlapping_ncrna", "ambiguous_orf",
"antisense_RNA", "lincRNA", "ncrna_host", "non_coding",
"processed_transcript", "retained_intron",
"sense_intronic", "sense_overlapping")]
## short noncoding
sncRNA <-
gtf$gene_name[gtf$transcript_biotype %in%
c("miRNA", "miRNA_pseudogene", "misc_RNA",
"misc_RNA_pseudogene", "Mt_rRNA", "Mt_tRNA",
"Mt_tRNA_pseudogene", "ncRNA", "pre_miRNA",
"RNase_MRP_RNA", "RNase_P_RNA", "rRNA", "rRNA_pseudogene",
"scRNA_pseudogene", "snlRNA", "snoRNA",
"snRNA_pseudogene", "SRP_RNA", "tmRNA", "tRNA",
"tRNA_pseudogene", "ribozyme", "scaRNA", "sRNA")]
## pseudogene
pseudogene <-
gtf$gene_name[gtf$transcript_biotype %in%
c("disrupted_domain", "IG_C_pseudogene", "IG_J_pseudogene",
"IG_pseudogene", "IG_V_pseudogene", "processed_pseudogene",
"pseudogene", "transcribed_processed_pseudogene",
"transcribed_unprocessed_pseudogene",
"translated_processed_pseudogene",
"translated_unprocessed_pseudogene", "TR_J_pseudogene",
"TR_V_pseudogene", "unitary_pseudogene",
"unprocessed_pseudogene")]
annotations <- list(protein=unique(protein),
mito=unique(mito),
lincRNA=unique(lincRNA),
sncRNA=unique(sncRNA),
pseudogene=unique(pseudogene))
annotations <- annotations[lengths(annotations)>0]
質量控制與細胞選取# Seurat會計算基因數以及UMI數 (nFeature and nCount).
# Seurat會將原始數據保存在RNA slot中,
# 每一行對應一個基因,每一列對應一個細胞.
slotNames(pbmc)Assays(pbmc) ## == names(pbmc@assays)
DefaultAssay(pbmc) ## 查看當前默認的assay。
head(Idents(pbmc)) ## 查看一下當前的細胞分組。# 在計算比例時,使用目標基因中的數值除以總的數值。
# `[[`運算符可以給metadata加column.
percent <- lapply(annotations, function(.ele){
PercentageFeatureSet(pbmc, features = rownames(pbmc)[rownames(pbmc) %in% .ele])
})
# AddMetaData 會在object@meta.data中加一列。這些信息都會在QC中使用到。
for(i in seq_along(percent)){
pbmc[[paste0("percent.", names(percent)[i])]] <- percent[[i]]
}
# AddMetaData也可以直接通過改變pbmc@meta.data來進行.
id <- sample(c("Group_A", "Group_B","Group_C"), nrow(pbmc@meta.data), replace = TRUE)
names(id) <- rownames(pbmc@meta.data) #必須有names
pbmc@meta.data$group <- id
head(pbmc@meta.data)查看不同種類RNA percent
VlnPlot(object = pbmc,
features = c("nFeature_RNA", "nCount_RNA",
paste0("percent.", names(percent))),
ncol =# FeatureScatter是一個用來畫點圖的工具,可以比較不同的metadata。plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# 從VlnPlot中,我們可以得到大多數細胞集中的範圍,通過這些信息,可以設置過濾條件。
# 比如可以使用unique gene counts, percent.protein, percentage.mito來進行過濾。
# 這裡我們設置了200 < nGene < 2500, 0.90 < percent.protein < Inf, -Inf < percent.mito < 0.05.
dim(pbmc)
過濾cell和featurepbmc <- subset(x = pbmc,
subset = nFeature_RNA > 200 & nFeature_RNA < 2500 &
percent.protein > 90 &
percent.mito < 5)
dim(pbmc)
3. Normalization# 使用的是"logNormalize", 就是將每個基因的表達量對該細胞總表達量進行平衡,然後乘以一個因子(scale factor, 默認值為10,000), 然後中進行對數轉換。
pbmc <- NormalizeData(pbmc)Variable genes
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst",
nfeatures = 2000)
# 找到變化最大的十個基因
top10 <- head(VariableFeatures(pbmc), 10)
top10# 畫圖查看
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE)從上圖中,我們可以看到很多基因都表現出很大的差異性,但是這些差異性並不一定都是事實,有些可能是因為實驗,或者數據採集的方式導入的。因此人們想儘量的消除這些因素帶來的差異。這些差異有可能是因為batch-effect, 或者cell alignment rate,或者其它。這些因素都可以寫入metadata中,然後通過ScaleData函數來消除
all.genes <- rownames(pbmc)
pbmc <- ScaleData(object = pbmc, features = all.genes) # 校正batch等
head(pbmc[["RNA"]]@scale.data[, 1:3])
4. PCA analysispbmc <- RunPCA(object = pbmc, features = VariableFeatures(pbmc))VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")DimPlot(object = pbmc, reduction = "pca")DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)DimHeatmap(pbmc, dims = 1:12, cells = 500, balanced = TRUE)
# 使用jackStraw方法來估計應該使用多少個principal components的基因來進行下遊分析
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)JackStrawPlot(pbmc, dims = 1:15)ElbowPlot(pbmc) # 得到cut.off值設置在7-10之間# # Split seurat object by condition to perform cell cycle scoring and SCT on all samples
# split_seurat <- SplitObject(pbmc, split.by = "group")
#
# split_seurat <- split_seurat[c("Group_A", "Group_B","Group_C")]
#
# for (i in 1:length(split_seurat)) {
# split_seurat[[i]] <- NormalizeData(split_seurat[[i]], verbose = TRUE)
# # split_seurat[[i]] <- CellCycleScoring(split_seurat[[i]], g2m.features=g2m_genes, s.features=s_genes)
# split_seurat[[i]] <- SCTransform(split_seurat[[i]])
# }# # Select the most variable features to use for integration
# integ_features <- SelectIntegrationFeatures(object.list = split_seurat,
# nfeatures = 3000)
# # Prepare the SCT list object for integration
# split_seurat <- PrepSCTIntegration(object.list = split_seurat,
# anchor.features = integ_features)
# # Find best buddies - can take a while to run
# integ_anchors <- FindIntegrationAnchors(object.list = split_seurat,
# normalization.method = "SCT",
# anchor.features = integ_features)
# # Integrate across conditions
# seurat_integrated <- IntegrateData(anchorset = integ_anchors,
# normalization.method = "SCT")# Run PCA
# seurat_integrated <- RunPCA(object = seurat_integrated)
#
# # Plot PCA
# PCAPlot(seurat_integrated,
# split.by = "group")
#
# # Run UMAP
# seurat_integrated <- RunUMAP(seurat_integrated,
# dims = 1:40,
# reduction = "pca")
#
# # Plot UMAP
# DimPlot(seurat_integrated,group.by = 'group')# DimPlot(seurat_integrated,group.by = 'group')
5. Cluster the cellspbmc <- FindNeighbors(pbmc, dims = 1:10)pbmc <- FindClusters(pbmc, resolution = 0.6) # 解析度為0.6
# 查看一下前幾個的cluster
head(Idents(pbmc), 5)生成降維圖像(UMAP/tSNE)
pbmc <- RunUMAP(pbmc, dims = 1:10)DimPlot(pbmc, reduction = "umap", label = TRUE)
6. Finding differentially expressed genes (cluster biomarkers)生成了tSNE圖像後,我們還需要確認這一堆堆的細胞都分別是些什麼細胞,這時,可以使用特定細胞的marker來對不同的細胞群進行標記。但是想要對它們進行標記,首先需要得到每個集群差異表達的基因。
在比較時,Seurat使用一比多的辦法。如果需要對cluster 1的細胞(ident.1)進行差異分析時,它比較的對象將是除了cluster 1以外的其它所有的細胞。也可以使用ident.2參數來傳遞需要比較的對象。在FindMarkers函數中,min.pct是指的marker基因所佔細胞數的最低比例,這裡使用1/4。使用max.cells.per.ident可以讓函數對細胞進行抽樣,以加速計算。FindAllMarkers可以一次性找到所有的高表達的marker
# find all markers of cluster 1
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5,
ident.2 = c(0,3), min.pct = 0.25)
head(cluster5.markers, n = 5)# find markers for every cluster compared to all remaining cells,
# report only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE,
min.pct = 0.25, thresh.use = 0.25)pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) ## top2 marker還可以使用不同的算法來尋找差異表達的基因。only.pos 參數,可以指定返回positive markers基因。test.use可以指定檢驗方法,可選擇的有:wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2
# cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 0,
# thresh.use = 0.25, test.use = "roc",
# only.pos = TRUE)Finds markers that are conserved between the groups
# #構建一個分組方式:
# pbmc[['groups']] <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE)
# head(FindConservedMarkers(pbmc, ident.1 = 0, ident.2 = 1, grouping.var = "groups"))使用VlnPlot來查看結果
VlnPlot(object = pbmc, features = c("S100A8", "S100A9"),log = TRUE)VlnPlot(object = pbmc, features = c("MS4A1", "CD79A"))# you can plot raw UMI counts as well
VlnPlot(object = pbmc, features= c("NKG7", "PF4"),
slot = "counts", log = TRUE)
tSNE可以查看不同基因在不同細胞中的表達情況
plot1 = FeaturePlot(object = pbmc,
features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A",
"FCGR3A", "LYZ", "PPBP", "CD8A"),
cols = c("blue", "red"))
plot(plot1)用heatmap查看
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
# 設置slim.col.label=TRUE將避免列印每個細胞的名稱,而只列印cluster的ID.
plot2 = DoHeatmap(object = pbmc, features = top10$gene) + NoLegend()
plot(plot2)##7. 對分集細胞進行標記
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T",
"CD14+ Mono", "B", "CD8 T",
"FCGR3A+ Mono",
"NK", "DC", "Platelet") ### 自定義
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()使用不同的解析度再算一次
pbmc[["ClusterNames_0.6"]] <- Idents(object = pbmc) #生成快照
# 現在我們使用不同的resolution來重新分群。當我們提高resolution的時候,
# 我們可以得到更多的群。
# 之前我們在FindClusters中設置了save.snn=TRUE,所以可以不用再計算SNN了。
# 下面就直接提高一下區分度,設置resolution = 0.8
pbmc <- FindClusters(pbmc, resolution = 0.8)# 並排畫兩個tSNE,右邊的是用ClusterNames_0.6 (resolution=0.6) 來分組顏色的。
# 我們可以看到當resolution提高之後,CD4 T細胞被分成了兩個亞群。
plot1 <- DimPlot(object = pbmc, reduction = "umap", label = TRUE) + NoLegend()
plot2 <- DimPlot(object = pbmc, reduction = "umap", label = TRUE,
group.by = "ClusterNames_0.6") +
NoLegend()
plot1 + plot2# 我們再一次尋找不同分集中的markers。
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)
# Most of the markers tend to be expressed in C1 (i.e. S100A4).
# However, we can see that CCR7 is upregulated in
# C0, strongly indicating that we can differentiate memory from naive CD4 cells.
# cols.use demarcates the color palette from low to high expressionFeaturePlot(object = pbmc, features = c("S100A4", "CCR7"),
cols = c("gray", "blue"))