TopGO做GO富集并可视化   

TopGO做GO富集并可视化

TopGO做GO富集并可视化

24 Dec 2016

Go back Your Document Title

Your Document Title

Document Author

2016-12-24

This is the practice of the blog about use topGO to draw DAG, and the topGOEsayUse

1 topGO package 核心

利用这个R包做GO 富集的核心是构建一个topGOdata对象. “The central step in using the topGO package is to create a topGOdata object[^ Official Mual].”

构建这个对象的必要参数是:

  1. ontology,可指定要分析的GO term的类型,即BP、CC和MF;
  2. description:对这个对象的描述,可选但是很重要
  3. allGenes:基因identifier的原始列表[^ 基因identifier,就像geneList这样的变量。可以附上某种分数以便后面施用某种统计处理,分数可以是t检验的p值或者与某个表型的correlation等;],和后面的geneSelectionFun联合作用,得出参与分析的基因,可以是numeric或factor。
  4. geneSelectionFun:基因选择函数,如果前面allGenes是numeric的话就必须得指明此参数;
  5. annotationFun:基因identifier map到GO term的函数。

然后用new()实例化(这是一个java中的术语)一个对象就可以做后续的分析了。

2 下面介绍完整的topGOdata构建流程

2.1 获取ensembl和GO号的对应关系: geneID2GO

# Be careful to run, the data is extremely large 
library(biomaRt)
listEnsembl()#查看可用数据
genes = useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl")
# 得到go信息和gene
gene2goInfo <- getBM(attributes=c('ensembl_gene_id','go_id','entrezgene','name_1006','go_linkage_type','namespace_1003'), mart = genes)
# 过滤
gene2goInfo=gene2goInfo[gene2goInfo$go_id != "", ]
# 上述是一个gene对应一个go id,需要合并为一个gene对应多个go id,利用by函数(神器)
geneID2GO = by(gene2goInfo$go_id, gene2goInfo$ensembl_gene_id, function(x) as.character(x))
save.image("dat.RData")

geneID2GO是符合topGO要求的,gene2goInfo和geneID2GO的格式如下

# the data is very large, so we directe load
load("dat.RData")
head(gene2goInfo)
##   ensembl_gene_id      go_id entrezgene
## 3 ENSG00000281614 GO:0004445       3635
## 4 ENSG00000281614 GO:0005515       3635
## 5 ENSG00000281614 GO:0016314       3635
## 6 ENSG00000281614 GO:0016787       3635
## 7 ENSG00000281614 GO:0017124       3635
## 8 ENSG00000281614 GO:0034485       3635
##                                                         name_1006
## 3                   inositol-polyphosphate 5-phosphatase activity
## 4                                                 protein binding
## 5 phosphatidylinositol-3,4,5-trisphosphate 3-phosphatase activity
## 6                                              hydrolase activity
## 7                                              SH3 domain binding
## 8 phosphatidylinositol-3,4,5-trisphosphate 5-phosphatase activity
##   go_linkage_type     namespace_1003
## 3             TAS molecular_function
## 4             IPI molecular_function
## 5             TAS molecular_function
## 6             IEA molecular_function
## 7             IEA molecular_function
## 8             TAS molecular_function
head(geneID2GO)
## $ENSG00000000003
##  [1] "GO:0070062" "GO:0016021" "GO:0016020" "GO:0005887" "GO:1901223"
##  [6] "GO:0043123" "GO:0039532" "GO:0007166" "GO:0005515" "GO:0004871"
## 
## $ENSG00000000005
##  [1] "GO:0001886" "GO:0001937" "GO:0016525" "GO:0035990" "GO:0071773"
##  [6] "GO:0005634" "GO:0005635" "GO:0005737" "GO:0016020" "GO:0016021"
## 
## $ENSG00000000419
##  [1] "GO:0033185" "GO:0016020" "GO:0005789" "GO:0005789" "GO:0005783"
##  [6] "GO:0005783" "GO:0005634" "GO:0035269" "GO:0035268" "GO:0019348"
## [11] "GO:0018279" "GO:0006506" "GO:0006487" "GO:0006486" "GO:0005515"
## [16] "GO:0004582" "GO:0004582" "GO:0004169" "GO:0043231" "GO:0097502"
## [21] "GO:0019673" "GO:0019348" "GO:0043178" "GO:0016757" "GO:0016740"
## [26] "GO:0005537"
## 
## $ENSG00000000457
##  [1] "GO:0016301" "GO:0005515" "GO:0005524" "GO:0006468" "GO:0016477"
##  [6] "GO:0005737" "GO:0005737" "GO:0005794" "GO:0005794" "GO:0030027"
## [11] "GO:0030027" "GO:0042995" "GO:0004672"
## 
## $ENSG00000000938
##  [1] "GO:0070062" "GO:0042995" "GO:0032587" "GO:0031234" "GO:0016020"
##  [6] "GO:0015629" "GO:0005886" "GO:0005886" "GO:0005829" "GO:0005829"
## [11] "GO:0005758" "GO:0005743" "GO:0005739" "GO:0005737" "GO:0050830"
## [16] "GO:0050830" "GO:0050764" "GO:0050764" "GO:0050715" "GO:0050715"
## [21] "GO:0046777" "GO:0045859" "GO:0045859" "GO:0045088" "GO:0045088"
## [26] "GO:0045087" "GO:0045087" "GO:0043552" "GO:0043306" "GO:0043306"
## [31] "GO:0042127" "GO:0038096" "GO:0038083" "GO:0030335" "GO:0030335"
## [36] "GO:0030154" "GO:0018108" "GO:0018108" "GO:0016477" "GO:0016310"
## [41] "GO:0014068" "GO:0009615" "GO:0008360" "GO:0008360" "GO:0007229"
## [46] "GO:0007229" "GO:0007229" "GO:0007169" "GO:0006468" "GO:0006468"
## [51] "GO:0002768" "GO:0002376" "GO:0034988" "GO:0034987" "GO:0034987"
## [56] "GO:0019901" "GO:0019901" "GO:0016740" "GO:0016301" "GO:0005524"
## [61] "GO:0005515" "GO:0004715" "GO:0004715" "GO:0004715" "GO:0004713"
## [66] "GO:0004713" "GO:0004713" "GO:0004713" "GO:0004672" "GO:0001784"
## [71] "GO:0001784" "GO:0000166" "GO:0032587" "GO:0015629" "GO:0005856"
## [76] "GO:0005856"
## 
## $ENSG00000000971
##  [1] "GO:0072562" "GO:0070062" "GO:0005615" "GO:0005576" "GO:0005576"
##  [6] "GO:0045087" "GO:0030449" "GO:0006957" "GO:0006956" "GO:0002376"
## [11] "GO:0043395" "GO:0008201" "GO:0005515"

2.2 构建基因列表:geneList

# cerate all genes for test
all_genes <- unique(head(gene2goInfo$ensembl_gene_id,40000))
DE_genes <- all_genes[sample(1:length(all_genes),300,replace = F)]
# constrct the geneList
geneList <- factor(as.integer(all_genes %in% DE_genes))
names(geneList) <- all_genes

If you read data from external data, you can use following code!

diff=read.table("./gene_exp.diff",sep="\t",header=TRUE)
# 差异表达基因
interesting_genes=factor(diff[diff$significant=="yes",]$gene_id)
# 所有基因
all_genes <- diff$gene_id
# 构建基因列表
geneList <- factor(as.integer (all_genes %in% interesting_genes))
names(geneList)=all_genes

2.3 构建topGO对象

library(topGO)
GOdata <- new("topGOdata", ontology = "MF", allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2GO)
# You may confuse the parameter annot, don't worry. YOU CAN FOUND the explaination in the official reference

2.4 基因富集分析

resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 611 nontrivial nodes
##       parameters: 
##           test statistic:  fisher
allRes <- GenTable(GOdata,classicFisher = resultFisher, orderBy = "classicFisher", topNodes = 10)
knitr::kable(as.data.frame(allRes))
GO.ID Term Annotated Significant Expected classicFisher
GO:0004872 receptor activity 188 39 26.83 0.0067
GO:0004930 G-protein coupled receptor activity 98 23 13.98 0.0083
GO:0060089 molecular transducer activity 218 43 31.11 0.0113
GO:0004888 transmembrane signaling receptor activit… 148 31 21.12 0.0134
GO:1901618 organic hydroxy compound transmembrane t… 8 4 1.14 0.0177
GO:0008020 G-protein coupled photoreceptor activity 2 2 0.29 0.0203
GO:0000400 four-way junction DNA binding 5 3 0.71 0.0230
GO:0038023 signaling receptor activity 158 31 22.55 0.0327
GO:0004984 olfactory receptor activity 43 11 6.14 0.0339
GO:0004871 signal transducer activity 188 35 26.83 0.0488

2.5 produce the DAG for viusualization

# svg("go.dag.svg")
# pdf("test.pdf")
showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo = 'all')
## Loading required package: Rgraphviz
## Loading required package: grid
## 
## Attaching package: 'grid'
## The following object is masked from 'package:topGO':
## 
##     depth

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 10 
## Number of Edges = 10 
## 
## $complete.dag
## [1] "A graph with 10 nodes."
# dev.off()
###########
data(GOdata)
data(results.tGO)

showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 6, useInfo = 'all')

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 45 
## Number of Edges = 86 
## 
## $complete.dag
## [1] "A graph with 45 nodes."
# directly save the picture without call the PDF device
# printGraph(GOdata, resultFisher, firstSigNodes = 5, fn.prefix = "sampleFile", useInfo = "all", pdfSW = TRUE)