微信公眾號:生信菜鳥團
關注可了解更多的教程。問題或建議,請公眾號留言;
如果你覺得本教程對你有幫助,歡迎讚賞
晶片的處理流程一般就是:數據讀入——數據過濾——數據校正——下遊分析。
讀文章拿到測序數據
本次講解用到的數據來自文章The relationship between DNA methylation, genetic and expression inter-individual variation in untransformed human fibroblast
從文章裡面找到數據存放地址如下:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52025
直接下載即可:
我下載的數據如下:
接著我們需要跟樣本分組,這裡我是按照GM和WG分組比較:
library(limma)
library(minfi)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kmanifest)
library(RColorBrewer)
library(missMethyl)
library(matrixStats)
library(minfiData)
rgSet <- read.metharray.exp("GSE52025/idat")
head(sampleNames(rgSet))
sampleNames(rgSet) <- substr(sampleNames(rgSet),"1", "10" )
建立樣本分組信息,分組信息我是基於下圖對應關係,構建了pD對象如下:
pD對象我是自己都進去整理的,部分代碼如下:
pD <- pD[sampleNames(rgSet),]
sampleNames(rgSet) <- rownames(pD)
pD$group=substr(rownames(pD),"1", "2" )
計算檢測p值detP <- detectionP(rgSet)
pal <- brewer.pal(8,"Dark2")
par(mfrow=c(1,2))
barplot(colMeans(detP), col=pal[factor(pD$group)], las=2,
cex.names=0.8, ylab="Mean detection p-values")
abline(h=0.05,col="red")
legend("topleft", legend=levels(factor(pD$group)), fill=pal,
bg="white")
barplot(colMeans(detP), col=pal[factor(pD$group)], las=2,
cex.names=0.8, ylim=c(0,0.002), ylab="Mean detection p-values")
abline(h=0.05,col="red")
legend("topleft", legend=levels(factor(pD$group)), fill=pal,
bg="white")
keep <- colMeans(detP) < 0.01
rgSet <- rgSet[,keep]
rgSet
detP <- detP[,keep]
dim(detP)
mSetSq <- preprocessQuantile(rgSet)
par(mfrow=c(1,2))
densityPlot(rgSet, sampGroups=pD$group,main="Raw", legend=FALSE)
legend("top", legend = levels(factor(pD$group)),
text.col=brewer.pal(8,"Dark2"))
densityPlot(getBeta(mSetSq), sampGroups=pD$group,
main="Normalized", legend=FALSE)
legend("top", legend = levels(factor(pD$group)),
text.col=brewer.pal(8,"Dark2"))
detP <- detP[match(featureNames(mSetSq),rownames(detP)),]
keep <- rowSums(detP < 0.01) == ncol(mSetSq)
table(keep)
mSetSqFlt <- mSetSq[keep,]
mSetSqFlt
ann450k = getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
head(ann450k)
keep <- !(featureNames(mSetSqFlt) %in% ann450k$Name[ann450k$chr %in% c("chrX","chrY")])
table(keep)
mSetSqFlt <- mSetSqFlt[keep,]
mSetSqFlt <- dropLociWithSnps(mSetSqFlt)
mSetSqFlt
網址:http://www.sickkids.ca/MS-Office-Files/Research/Weksberg%20Lab/48639-non-specific-probes-Illumina450k.xlsx
下載下來讀進去即可
xReactiveProbes <- read.csv("C:/Users/Administrator/Downloads/48639-non-specific-probes-Illumina450k.csv",sep=",", stringsAsFactors=FALSE)
keep <- !(featureNames(mSetSqFlt) %in% xReactiveProbes$TargetID)
table(keep)
mSetSqFlt <- mSetSqFlt[keep,]
mSetSqFlt
一旦數據被過濾和標準化,重新檢查MDS圖以查看樣本之間的關係是否發生了變化通常是有用的
par(mfrow=c(1,2))
plotMDS(getM(mSetSqFlt), top=1000, gene.selection="common",
col=pal[factor(pD$group)], cex=0.8)
legend("right", legend=levels(factor(pD$group)), text.col=pal,
cex=0.65, bg="white")
plotMDS(getM(mSetSqFlt), top=1000, gene.selection="common",
col=pal[factor(pD$group)])
legend("right", legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg="white")
一旦數據被過濾和標準化,重新檢查MDS圖以查看樣本之間的關係是否發生了變化通常是有用的
par(mfrow=c(1,3))
# Examine higher dimensions to look at other sources of variation
plotMDS(getM(mSetSqFlt), top=1000, gene.selection="common",
col=pal[factor(pD$group)], dim=c(1,3))
legend("right", legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg="white")
plotMDS(getM(mSetSqFlt), top=1000, gene.selection="common",
col=pal[factor(pD$group)], dim=c(2,3))
legend("topright", legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg="white")
plotMDS(getM(mSetSqFlt), top=1000, gene.selection="common",
col=pal[factor(pD$group)], dim=c(3,4))
legend("right", legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg="white")
下一步是計算m值和beta值。m值具有更好的統計特性,因此更適合用於甲基化數據的統計分析,而beta值易於解釋,因此更適合於顯示數據
mVals <- getM(mSetSqFlt)
head(mVals[,1:5])
bVals <- getBeta(mSetSqFlt)
head(bVals[,1:5])
繪製密度曲線
par(mfrow=c(1,2))
densityPlot(bVals, sampGroups=pD$group, main="Beta values",
legend=FALSE, xlab="Beta values")
legend("top", legend = levels(factor(pD$group)),
text.col=brewer.pal(8,"Dark2"))
densityPlot(mVals, sampGroups=pD$group, main="M-values",
legend=FALSE, xlab="M values")
legend("topleft", legend = levels(factor(pD$group)),
text.col=brewer.pal(8,"Dark2"))
group <- factor(pD$group)
# use the above to create a design matrix
design <- model.matrix(~0+group, data=pD)
colnames(design) <- c(levels(group))
fit <- lmFit(bVals, design)
contMatrix <- makeContrasts(GM-WG,
levels=design)
contMatrix
fit2 <- contrasts.fit(fit, contMatrix)
fit2 <- eBayes(fit2)
ann450kSub <- ann450k[match(rownames(mVals),ann450k$Name), c(1:4,12:19,24:ncol(ann450k))]
DMPs <- topTable(fit2, num=Inf, coef=1, genelist=ann450kSub)
head(DMPs)
繪製cpg表達
par(mfrow=c(2,2))
purrr::map(rownames(DMPs)[1:4], function(cpg){
plotCpg(bVals, cpg=cpg, pheno=pD$group, ylab = "Beta values")
})
sigCpGs <- DMPs$Name[DMPs$adj.P.Val<0.05]
sigCpGs[1:10]
all <- DMPs$Name
length(all)
par(mfrow=c(1,1))
gst <- gometh(sig.cpg=sigCpGs, all.cpg=all, plot.bias=TRUE)
#Top 10 GO categories
topGO(gst, number=10)
kegg <- gometh(sig.cpg = sigCpGs, all.cpg = all, collection = "KEGG", prior.prob=TRUE)
topKEGG(kegg)
http://master.bioconductor.org/packages/release/workflows/vignettes/methylationArrayAnalysis/inst/doc/methylationArrayAnalysis.html