其他分享
首页 > 其他分享> > 基因富集分析

基因富集分析

作者:互联网

前面已经探究了KEGG富集分析的做法,但是存在一些问题。现在进行一些尝试:

尝试1:直接用斑马鱼的基因组为背景进行富集分析:【做KEGG富集分析,必须要:KEGG,NCBI和UniProt的基因编码形式,如果不是,就需要转换】

  但是我的基因最先是NCBI蛋白序列的基因编码,因此要先找到蛋白编码与NCBI中GeneID的对应关系。

(base) ssy49@biot2-PowerEdge-R840 2022-08-02 07:58:07 ~/project/01_oxygen/ortho/05orth/OrthoFinder/Results_Mar28/expend_gene
$grep ">" *.fa2 >>expendGeneList.txt  #获取基因
$sed "s/OG.*>//g" expendGeneList.txt |awk -F"_" '{print $1"_"$2}' >z_expProtId.txt #获取蛋白编码

#由于只按照斑马鱼注释,所以只提取斑马鱼基因编码即可
$sed "s/OG.*>//g" expendGeneList.txt |grep "Danio_rerio" |awk -F"_" '{print $1"_"$2}' >z_expProtId.txt2

(base) ssy49@biot2-PowerEdge-R840 2022-08-02 08:07:26 ~/project/01_oxygen/Obli-W/D_rerio
$cp /home/data/ssy49/project/01_oxygen/ortho/05orth/OrthoFinder/Results_Mar28/expend_gene/z_expProtId.txt2 .
$for i in `cat z_expProtId.txt2`; do grep -w $i GCF_000002035.6_GRCz11_genomic.gff >>zong.txt; done

cat zong.txt |awk '{for (i=9;i<=NF;i++)printf("%s ", $i);print ""}' >zong.txt2
cat zong.txt2 |awk -F";" '{print $4"\t"$(NF)"\t"$(NF-1)"\t"$(NF-3)}' >zong.txt5
awk '{FS = "\t"}{for (f=1; f <= NF; f+=1) {if ($f ~ /gene=/) {print NR,$f}}}' zong5 >zong6
cat zong6 |awk '{print $2}' |sed "s/gene=//g"|uniq > proGeneId.txt

 

整理好的基因列表直接就可以用R做富集分析了【KEGG和GO都是用的同一个基因列表输入文件】:

KEGG分析:注意KEGG富集是没有三个层次分别的,只有GO才有

#20020803-KEGG分析
#scriptName:keggAnalysis.R
library("BiocManager")
BiocManager::install("clusterProfiler")
library("clusterProfiler")
BiocManager::install("org.Dr.eg.db",dependencies = TRUE) #查看有哪些库可用:https://www.omicsclass.com/article/262
library("org.Dr.eg.db")
BiocManager::install("enrichplot")
library("enrichplot")
library("ggplot2")
BiocManager::install("pathview")
library("pathview")
BiocManager::install("ggnewscale")
library("ggnewscale")
library("DOSE")
pvalueFilter=0.05
qvalueFilter=1
showNum=20


rt=read.table("expGene.txt")
# head(rt)
genes=as.vector(rt[,1])
head(genes)
entrezIDs <- mget(genes, org.Dr.egSYMBOL2EG, ifnotfound = NA) ##就是用org.Dr.eg.db转换基因ID,很多网站也是可以转换的https://zhuanlan.zhihu.com/p/148116840
entrezIDs <- as.character(entrezIDs)
rt=cbind(rt, entrezID=entrezIDs)
colnames(rt)=c("symbol","entrezID")

gene=rt$entrezID
gene=unique(gene)
colorSel="qvalue"
if(qvalueFilter>0.05){
  colorSel="pvalue"
}
# install.packages("R.utils")
R.utils::setOption("clusterProfiler.download.method",'auto')
options(clusterProfiler.download.method = "auto")
kk <- enrichKEGG(gene = gene, organism = "dre", pvalueCutoff = 1, qvalueCutoff = 1)  #http://www.genome.jp/kegg/catalog/org_list.html
KEGG=as.data.frame(kk)

pdf(file = "kegg_barplot.pdf", width = 9, height = 7)
barplot(kk, drop=TRUE, showCategory = showNum, color = colorSel)
dev.off()

pdf(file = "kegg_dotplot.pdf", width = 9, height = 7)
dotplot(kk, showCategory = showNum, orderBy = "GeneRation", color = colorSel)
dev.off()

pdf(file = "kegg_cnetplot.pdf", width = 9, height = 7)
af=setReadable(kk, 'org.Dr.eg.db', 'ENTREZID')
cnetplot(af, showCategory = 3, categorySize='pvalue', circular=TRUE, cex_label_category=0.65, cex_label_gene=0.6)
dev.off()

pdf(file = "kegg_net.pdf", width = 9, height = 7)
x2 <- pairwise_termsim(kk)
emapplot(x2, showCategory = 20, cex_label_category=0.65, color="pvalue", layout="nicely")
dev.off() #没有成功,因为有一个基因有问题。参考GO分析

#画某一通路(一定要有网)
keggId="dre04530" #KEGG变量里就有各个通路
geneFC=rep(1, length(gene))
names(geneFC)=gene
pv.out=pathview(gene.data = geneFC,pathway.id = keggId, species = "dre",outsuffix="pathview")
p <- pathview(gene.data = geneFC, pathway.id = keggId, species = "dre",kegg.native = F,sign.pos="bottomleft",same.layer=F) 

可能遇到的报错1也可参考这个参考】【该错误尝试了换网、断网重连、关机等都没有解决,准备换到服务器的Rstudio上试试,结果一下就可以了(看能跟人品有点关系)】

      

#单个信号通路图

     

GO分析:出4种图【上面KEGG中血红蛋白基因我查了还是没有富集到,但GO很明显富集出来了,也不知道是不是KEGG库里边本来就没有血红蛋白能富集的通路,就像前面做的,KEGG手动富集,也不能把血红蛋白富集到】

#20220803-GO富集分析
#scriptName: GoAnalysis.R
library("clusterProfiler")
library("org.Dr.eg.db")
library("enrichplot")
library("ggplot2")
library("ggnewscale")
library("DOSE")
pvalueFilter=0.05
qvalueFilter=1
showNum=8 #GO富集有三个层次CC/BP/MF,而KEGG没有层次之分.这里设置8,那么已经有8×3=24个了

rt=read.table("expGene.txt")
genes=as.vector(rt[,1])
entrezIDs <- mget(genes, org.Dr.egSYMBOL2EG, ifnotfound = NA) 
entrezIDs <- as.character(entrezIDs)
rt=cbind(rt, entrezID=entrezIDs)
colnames(rt)=c("symbol","entrezID")
rt=rt[is.na(rt[,"entrezID"])==F,]
gene=rt$entrezID
gene=unique(gene)
write.table(gene,file = "gene.txt")
colorSel="qvalue"
if(qvalueFilter>0.05){
  colorSel="pvalue"
}

gene2 <- gsub('c.*)', '', gene) #gene变量中有一个有问题,因此需要删除
# write.table(gene2,file = "gene2.txt")
kk=enrichGO(gene = gene2,OrgDb = org.Dr.eg.db,pvalueCutoff = 1,qvalueCutoff = 1,ont = "all",readable = T)
GO=as.data.frame(kk)
GO=GO[(GO$pvalue<pvalueFilter & GO$qvalue<qvalueFilter),]
# write.table(GO,file = "GO.txt", sep = "\t",quote = F,row.names = F)

if (nrow(GO)<30) {
  showNum=nrow(GO)
}

pdf(file = "GO_barplot.pdf", width = 9, height = 7)
bar=barplot(kk,showCategory = showNum,split="ONTOLOGY",color = colorSel)+facet_grid(ONTOLOGY~.,scale="free")+scale_y_discrete(labels=function(x)stringr::str_wrap(x,width = 60))
print(bar)
dev.off()

pdf(file = "GO_bubble.pdf", width = 9, height = 7)
bub=dotplot(kk,showCategory = showNum,orderBy="GeneRatio",split="ONTOLOGY",color = colorSel)+facet_grid(ONTOLOGY~.,scale="free")+scale_y_discrete(labels=function(x)stringr::str_wrap(x,width = 60))
print(bub)
dev.off()

pdf(file = "GO_cnet.pdf", width = 9, height = 7)
af=setReadable(kk,'org.Dr.eg.db','ENTREZID')
cnetplot(af, showCategory = 10, categorySize='pvalue', circular=TRUE, colorEdge=TRUE,cex_label_category=0.65, cex_label_gene=0.6)
dev.off()

pdf(file = "GO_net.pdf", width = 9, height = 7)
x2 <- pairwise_termsim(kk)
emapplot(x2, showCategory = 20, cex_label_category=0.65, color="pvalue", layout="nicely")
dev.off()

      

标签:富集,分析,KEGG,基因,library,install,GO,txt
来源: https://www.cnblogs.com/ly-zy/p/16558453.html