This is the practice of the blog about use topGO to draw DAG, and the topGOEsayUse

1 topGO package 核心

利用这个R包做GO 富集的核心是构建一个topGOdata对象. “The central step in using the topGO package is to create a topGOdata object[^ Official Mual].”

构建这个对象的必要参数是：

ontology，可指定要分析的GO term的类型，即BP、CC和MF；
description:对这个对象的描述，可选但是很重要
allGenes：基因identifier的原始列表[^ 基因identifier，就像geneList这样的变量。可以附上某种分数以便后面施用某种统计处理，分数可以是t检验的p值或者与某个表型的correlation等；]，和后面的geneSelectionFun联合作用，得出参与分析的基因，可以是numeric或factor。
geneSelectionFun：基因选择函数，如果前面allGenes是numeric的话就必须得指明此参数；
annotationFun：基因identifier map到GO term的函数。

然后用new()实例化(这是一个java中的术语)一个对象就可以做后续的分析了。

2 下面介绍完整的topGOdata构建流程

2.1 获取ensembl和GO号的对应关系: geneID2GO

# Be careful to run, the data is extremely large 
library(biomaRt)
listEnsembl()#查看可用数据
genes = useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl")
# 得到go信息和gene
gene2goInfo <- getBM(attributes=c('ensembl_gene_id','go_id','entrezgene','name_1006','go_linkage_type','namespace_1003'), mart = genes)
# 过滤
gene2goInfo=gene2goInfo[gene2goInfo$go_id != "", ]
# 上述是一个gene对应一个go id，需要合并为一个gene对应多个go id，利用by函数（神器）
geneID2GO = by(gene2goInfo$go_id, gene2goInfo$ensembl_gene_id, function(x) as.character(x))
save.image("dat.RData")

geneID2GO是符合topGO要求的，gene2goInfo和geneID2GO的格式如下

# the data is very large, so we directe load
load("dat.RData")
head(gene2goInfo)

##   ensembl_gene_id      go_id entrezgene
## 3 ENSG00000281614 GO:0004445       3635
## 4 ENSG00000281614 GO:0005515       3635
## 5 ENSG00000281614 GO:0016314       3635
## 6 ENSG00000281614 GO:0016787       3635
## 7 ENSG00000281614 GO:0017124       3635
## 8 ENSG00000281614 GO:0034485       3635
##                                                         name_1006
## 3                   inositol-polyphosphate 5-phosphatase activity
## 4                                                 protein binding
## 5 phosphatidylinositol-3,4,5-trisphosphate 3-phosphatase activity
## 6                                              hydrolase activity
## 7                                              SH3 domain binding
## 8 phosphatidylinositol-3,4,5-trisphosphate 5-phosphatase activity
##   go_linkage_type     namespace_1003
## 3             TAS molecular_function
## 4             IPI molecular_function
## 5             TAS molecular_function
## 6             IEA molecular_function
## 7             IEA molecular_function
## 8             TAS molecular_function

head(geneID2GO)

## $ENSG00000000003
##  [1] "GO:0070062" "GO:0016021" "GO:0016020" "GO:0005887" "GO:1901223"
##  [6] "GO:0043123" "GO:0039532" "GO:0007166" "GO:0005515" "GO:0004871"
## 
## $ENSG00000000005
##  [1] "GO:0001886" "GO:0001937" "GO:0016525" "GO:0035990" "GO:0071773"
##  [6] "GO:0005634" "GO:0005635" "GO:0005737" "GO:0016020" "GO:0016021"
## 
## $ENSG00000000419
##  [1] "GO:0033185" "GO:0016020" "GO:0005789" "GO:0005789" "GO:0005783"
##  [6] "GO:0005783" "GO:0005634" "GO:0035269" "GO:0035268" "GO:0019348"
## [11] "GO:0018279" "GO:0006506" "GO:0006487" "GO:0006486" "GO:0005515"
## [16] "GO:0004582" "GO:0004582" "GO:0004169" "GO:0043231" "GO:0097502"
## [21] "GO:0019673" "GO:0019348" "GO:0043178" "GO:0016757" "GO:0016740"
## [26] "GO:0005537"
## 
## $ENSG00000000457
##  [1] "GO:0016301" "GO:0005515" "GO:0005524" "GO:0006468" "GO:0016477"
##  [6] "GO:0005737" "GO:0005737" "GO:0005794" "GO:0005794" "GO:0030027"
## [11] "GO:0030027" "GO:0042995" "GO:0004672"
## 
## $ENSG00000000938
##  [1] "GO:0070062" "GO:0042995" "GO:0032587" "GO:0031234" "GO:0016020"
##  [6] "GO:0015629" "GO:0005886" "GO:0005886" "GO:0005829" "GO:0005829"
## [11] "GO:0005758" "GO:0005743" "GO:0005739" "GO:0005737" "GO:0050830"
## [16] "GO:0050830" "GO:0050764" "GO:0050764" "GO:0050715" "GO:0050715"
## [21] "GO:0046777" "GO:0045859" "GO:0045859" "GO:0045088" "GO:0045088"
## [26] "GO:0045087" "GO:0045087" "GO:0043552" "GO:0043306" "GO:0043306"
## [31] "GO:0042127" "GO:0038096" "GO:0038083" "GO:0030335" "GO:0030335"
## [36] "GO:0030154" "GO:0018108" "GO:0018108" "GO:0016477" "GO:0016310"
## [41] "GO:0014068" "GO:0009615" "GO:0008360" "GO:0008360" "GO:0007229"
## [46] "GO:0007229" "GO:0007229" "GO:0007169" "GO:0006468" "GO:0006468"
## [51] "GO:0002768" "GO:0002376" "GO:0034988" "GO:0034987" "GO:0034987"
## [56] "GO:0019901" "GO:0019901" "GO:0016740" "GO:0016301" "GO:0005524"
## [61] "GO:0005515" "GO:0004715" "GO:0004715" "GO:0004715" "GO:0004713"
## [66] "GO:0004713" "GO:0004713" "GO:0004713" "GO:0004672" "GO:0001784"
## [71] "GO:0001784" "GO:0000166" "GO:0032587" "GO:0015629" "GO:0005856"
## [76] "GO:0005856"
## 
## $ENSG00000000971
##  [1] "GO:0072562" "GO:0070062" "GO:0005615" "GO:0005576" "GO:0005576"
##  [6] "GO:0045087" "GO:0030449" "GO:0006957" "GO:0006956" "GO:0002376"
## [11] "GO:0043395" "GO:0008201" "GO:0005515"

2.2 构建基因列表：geneList

# cerate all genes for test
all_genes <- unique(head(gene2goInfo$ensembl_gene_id,40000))
DE_genes <- all_genes[sample(1:length(all_genes),300,replace = F)]
# constrct the geneList
geneList <- factor(as.integer(all_genes %in% DE_genes))
names(geneList) <- all_genes

If you read data from external data, you can use following code!

diff=read.table("./gene_exp.diff",sep="\t",header=TRUE)
# 差异表达基因
interesting_genes=factor(diff[diff$significant=="yes",]$gene_id)
# 所有基因
all_genes <- diff$gene_id
# 构建基因列表
geneList <- factor(as.integer (all_genes %in% interesting_genes))
names(geneList)=all_genes

2.3 构建topGO对象

library(topGO)
GOdata <- new("topGOdata", ontology = "MF", allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2GO)
# You may confuse the parameter annot, don't worry. YOU CAN FOUND the explaination in the official reference

2.4 基因富集分析

resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")

## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 611 nontrivial nodes
##       parameters: 
##           test statistic:  fisher

allRes <- GenTable(GOdata,classicFisher = resultFisher, orderBy = "classicFisher", topNodes = 10)
knitr::kable(as.data.frame(allRes))

GO.ID	Term	Annotated	Significant	Expected	classicFisher
GO:0004872	receptor activity	188	39	26.83	0.0067
GO:0004930	G-protein coupled receptor activity	98	23	13.98	0.0083
GO:0060089	molecular transducer activity	218	43	31.11	0.0113
GO:0004888	transmembrane signaling receptor activit…	148	31	21.12	0.0134
GO:1901618	organic hydroxy compound transmembrane t…	8	4	1.14	0.0177
GO:0008020	G-protein coupled photoreceptor activity	2	2	0.29	0.0203
GO:0000400	four-way junction DNA binding	5	3	0.71	0.0230
GO:0038023	signaling receptor activity	158	31	22.55	0.0327
GO:0004984	olfactory receptor activity	43	11	6.14	0.0339
GO:0004871	signal transducer activity	188	35	26.83	0.0488

2.5 produce the DAG for viusualization

# svg("go.dag.svg")
# pdf("test.pdf")
showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo = 'all')

## Loading required package: Rgraphviz

## Loading required package: grid

## 
## Attaching package: 'grid'

## The following object is masked from 'package:topGO':
## 
##     depth

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 10 
## Number of Edges = 10 
## 
## $complete.dag
## [1] "A graph with 10 nodes."

# dev.off()
###########
data(GOdata)
data(results.tGO)

showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 6, useInfo = 'all')

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 45 
## Number of Edges = 86 
## 
## $complete.dag
## [1] "A graph with 45 nodes."

# directly save the picture without call the PDF device
# printGraph(GOdata, resultFisher, firstSigNodes = 5, fn.prefix = "sampleFile", useInfo = "all", pdfSW = TRUE)

TopGO做GO富集并可视化

Your Document Title

Document Author

2016-12-24