基因富集分析
作者:互联网
前面已经探究了KEGG富集分析的做法,但是存在一些问题。现在进行一些尝试:
尝试1:直接用斑马鱼的基因组为背景进行富集分析:【做KEGG富集分析,必须要:KEGG,NCBI和UniProt的基因编码形式,如果不是,就需要转换】
但是我的基因最先是NCBI蛋白序列的基因编码,因此要先找到蛋白编码与NCBI中GeneID的对应关系。
(base) ssy49@biot2-PowerEdge-R840 2022-08-02 07:58:07 ~/project/01_oxygen/ortho/05orth/OrthoFinder/Results_Mar28/expend_gene $grep ">" *.fa2 >>expendGeneList.txt #获取基因 $sed "s/OG.*>//g" expendGeneList.txt |awk -F"_" '{print $1"_"$2}' >z_expProtId.txt #获取蛋白编码 #由于只按照斑马鱼注释,所以只提取斑马鱼基因编码即可 $sed "s/OG.*>//g" expendGeneList.txt |grep "Danio_rerio" |awk -F"_" '{print $1"_"$2}' >z_expProtId.txt2 (base) ssy49@biot2-PowerEdge-R840 2022-08-02 08:07:26 ~/project/01_oxygen/Obli-W/D_rerio $cp /home/data/ssy49/project/01_oxygen/ortho/05orth/OrthoFinder/Results_Mar28/expend_gene/z_expProtId.txt2 . $for i in `cat z_expProtId.txt2`; do grep -w $i GCF_000002035.6_GRCz11_genomic.gff >>zong.txt; done
cat zong.txt |awk '{for (i=9;i<=NF;i++)printf("%s ", $i);print ""}' >zong.txt2
cat zong.txt2 |awk -F";" '{print $4"\t"$(NF)"\t"$(NF-1)"\t"$(NF-3)}' >zong.txt5
awk '{FS = "\t"}{for (f=1; f <= NF; f+=1) {if ($f ~ /gene=/) {print NR,$f}}}' zong5 >zong6
cat zong6 |awk '{print $2}' |sed "s/gene=//g"|uniq > proGeneId.txt
整理好的基因列表直接就可以用R做富集分析了【KEGG和GO都是用的同一个基因列表输入文件】:
KEGG分析:注意KEGG富集是没有三个层次分别的,只有GO才有
#20020803-KEGG分析 #scriptName:keggAnalysis.R library("BiocManager") BiocManager::install("clusterProfiler") library("clusterProfiler") BiocManager::install("org.Dr.eg.db",dependencies = TRUE) #查看有哪些库可用:https://www.omicsclass.com/article/262 library("org.Dr.eg.db") BiocManager::install("enrichplot") library("enrichplot") library("ggplot2") BiocManager::install("pathview") library("pathview") BiocManager::install("ggnewscale") library("ggnewscale") library("DOSE") pvalueFilter=0.05 qvalueFilter=1 showNum=20 rt=read.table("expGene.txt") # head(rt) genes=as.vector(rt[,1]) head(genes) entrezIDs <- mget(genes, org.Dr.egSYMBOL2EG, ifnotfound = NA) ##就是用org.Dr.eg.db转换基因ID,很多网站也是可以转换的https://zhuanlan.zhihu.com/p/148116840 entrezIDs <- as.character(entrezIDs) rt=cbind(rt, entrezID=entrezIDs) colnames(rt)=c("symbol","entrezID") gene=rt$entrezID gene=unique(gene) colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } # install.packages("R.utils") R.utils::setOption("clusterProfiler.download.method",'auto') options(clusterProfiler.download.method = "auto") kk <- enrichKEGG(gene = gene, organism = "dre", pvalueCutoff = 1, qvalueCutoff = 1) #http://www.genome.jp/kegg/catalog/org_list.html KEGG=as.data.frame(kk) pdf(file = "kegg_barplot.pdf", width = 9, height = 7) barplot(kk, drop=TRUE, showCategory = showNum, color = colorSel) dev.off() pdf(file = "kegg_dotplot.pdf", width = 9, height = 7) dotplot(kk, showCategory = showNum, orderBy = "GeneRation", color = colorSel) dev.off() pdf(file = "kegg_cnetplot.pdf", width = 9, height = 7) af=setReadable(kk, 'org.Dr.eg.db', 'ENTREZID') cnetplot(af, showCategory = 3, categorySize='pvalue', circular=TRUE, cex_label_category=0.65, cex_label_gene=0.6) dev.off() pdf(file = "kegg_net.pdf", width = 9, height = 7) x2 <- pairwise_termsim(kk) emapplot(x2, showCategory = 20, cex_label_category=0.65, color="pvalue", layout="nicely") dev.off() #没有成功,因为有一个基因有问题。参考GO分析 #画某一通路(一定要有网) keggId="dre04530" #KEGG变量里就有各个通路 geneFC=rep(1, length(gene)) names(geneFC)=gene pv.out=pathview(gene.data = geneFC,pathway.id = keggId, species = "dre",outsuffix="pathview") p <- pathview(gene.data = geneFC, pathway.id = keggId, species = "dre",kegg.native = F,sign.pos="bottomleft",same.layer=F)
可能遇到的报错1【也可参考这个;参考】【该错误尝试了换网、断网重连、关机等都没有解决,准备换到服务器的Rstudio上试试,结果一下就可以了(看能跟人品有点关系)】
#单个信号通路图
GO分析:出4种图【上面KEGG中血红蛋白基因我查了还是没有富集到,但GO很明显富集出来了,也不知道是不是KEGG库里边本来就没有血红蛋白能富集的通路,就像前面做的,KEGG手动富集,也不能把血红蛋白富集到】
#20220803-GO富集分析 #scriptName: GoAnalysis.R library("clusterProfiler") library("org.Dr.eg.db") library("enrichplot") library("ggplot2") library("ggnewscale") library("DOSE") pvalueFilter=0.05 qvalueFilter=1 showNum=8 #GO富集有三个层次CC/BP/MF,而KEGG没有层次之分.这里设置8,那么已经有8×3=24个了 rt=read.table("expGene.txt") genes=as.vector(rt[,1]) entrezIDs <- mget(genes, org.Dr.egSYMBOL2EG, ifnotfound = NA) entrezIDs <- as.character(entrezIDs) rt=cbind(rt, entrezID=entrezIDs) colnames(rt)=c("symbol","entrezID") rt=rt[is.na(rt[,"entrezID"])==F,] gene=rt$entrezID gene=unique(gene) write.table(gene,file = "gene.txt") colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } gene2 <- gsub('c.*)', '', gene) #gene变量中有一个有问题,因此需要删除 # write.table(gene2,file = "gene2.txt") kk=enrichGO(gene = gene2,OrgDb = org.Dr.eg.db,pvalueCutoff = 1,qvalueCutoff = 1,ont = "all",readable = T) GO=as.data.frame(kk) GO=GO[(GO$pvalue<pvalueFilter & GO$qvalue<qvalueFilter),] # write.table(GO,file = "GO.txt", sep = "\t",quote = F,row.names = F) if (nrow(GO)<30) { showNum=nrow(GO) } pdf(file = "GO_barplot.pdf", width = 9, height = 7) bar=barplot(kk,showCategory = showNum,split="ONTOLOGY",color = colorSel)+facet_grid(ONTOLOGY~.,scale="free")+scale_y_discrete(labels=function(x)stringr::str_wrap(x,width = 60)) print(bar) dev.off() pdf(file = "GO_bubble.pdf", width = 9, height = 7) bub=dotplot(kk,showCategory = showNum,orderBy="GeneRatio",split="ONTOLOGY",color = colorSel)+facet_grid(ONTOLOGY~.,scale="free")+scale_y_discrete(labels=function(x)stringr::str_wrap(x,width = 60)) print(bub) dev.off() pdf(file = "GO_cnet.pdf", width = 9, height = 7) af=setReadable(kk,'org.Dr.eg.db','ENTREZID') cnetplot(af, showCategory = 10, categorySize='pvalue', circular=TRUE, colorEdge=TRUE,cex_label_category=0.65, cex_label_gene=0.6) dev.off() pdf(file = "GO_net.pdf", width = 9, height = 7) x2 <- pairwise_termsim(kk) emapplot(x2, showCategory = 20, cex_label_category=0.65, color="pvalue", layout="nicely") dev.off()
标签:富集,分析,KEGG,基因,library,install,GO,txt 来源: https://www.cnblogs.com/ly-zy/p/16558453.html