MeSH analysis

Beef cattle data

Li et al. (2015) reported a list of differentially expressed genes in the ruminal wall of grass-fed and grain-fed cattle from the RNA-seq study. Visit the journal website and download the file S3 Table.

1. Required R/Bioconductor packages

library("biomaRt")
library("readxl")
library("org.Bt.eg.db")
library("GOstats")
library("meshr")
library("MeSH.db")
library("MeSH.Bta.eg.db")
library("corrplot")
library("MeSHSim")
library("GOSemSim")
library("tagcloud")
library("RColorBrewer")

2. Create a vector of background genes

We first create a vector of background genes. Note that Biomart has recently changed its server from www.biomart.org to www.ensembl.org.

## access to biomaRt
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "btaurus_gene_ensembl", 
    host = "www.ensembl.org")
univ.geneID <- getBM(attributes = c("ensembl_gene_id", "entrezgene", "hgnc_symbol"), 
    mart = mart)  # 25898  
## remove genes with no corresponding Entrez Gene ID
univ.geneID2 <- univ.geneID[!is.na(univ.geneID[, 2]), ]  # 18279 
## remove duplicated Entrez Gene ID
univ.geneID3 <- univ.geneID2[!duplicated(univ.geneID2[, 2]), ]  # 17745
load("univgeneID3.Rda")

3. Create a vector of selected genes

Secondly, we create a vector of selected genes by reading the input file.

## read data
my.geneID <- read_excel("journal.pone.0116437.s004.XLS", col_names = TRUE, sheet = 1, 
    na = "NA")  # 342
colnames(my.geneID)[1] <- "ensembl_gene_id"
## merge two files
my.geneID2 <- merge(my.geneID, univ.geneID3, by = "ensembl_gene_id")  # 317
## remove duplicated Entrez Gene ID
my.geneID3 <- my.geneID2[!duplicated(my.geneID2$entrezgene), ]  # 317

4. GO enrichment analysis

We perform a GO analysis using the GOstats package.

paraGO <- new("GOHyperGParams", geneIds = my.geneID3[, 6], universeGeneIds = univ.geneID3[, 
    2], annotation = "org.Bt.eg.db", ontology = "BP", pvalueCutoff = 0.05, conditional = TRUE, 
    testDirection = "over")

GO enrichment analysis for Biological Process (BP)

BP <- hyperGTest(paraGO)
summary(BP)[, c(1, 2, 7)]

GO enrichment analysis for Molecular Function (MF)

ontology(paraGO) <- "MF"
MF <- hyperGTest(paraGO)
summary(MF)[, c(1, 2, 7)]

GO enrichment analysis for Cellular Component (CC)

ontology(paraGO) <- "CC"
CC <- hyperGTest(paraGO)
summary(CC)[, c(1, 2, 7)]

5. MeSH enrichment analysis

Then, we perform a MeSH ORA for the category Chemicals and Drugs by setting ‘category=“D”’.

meshParams <- new("MeSHHyperGParams", geneIds = my.geneID3[, 6], universeGeneIds = univ.geneID3[, 
    2], annotation = "MeSH.Bta.eg.db", category = "D", database = "gene2pubmed", 
    pvalueCutoff = 0.05, pAdjust = "none")
meshR.D <- meshHyperGTest(meshParams)
head(summary(meshR.D))
summary(meshR.D)[!duplicated(summary(meshR.D)[, 7]), c(1, 2, 7)]

Switching to a different category is easily done by the ‘category<-’ function. Here, we use Diseases (category = “C”).

category(meshParams) <- "C"
meshR.C <- meshHyperGTest(meshParams)
head(summary(meshR.C))
summary(meshR.C)[!duplicated(summary(meshR.C)[, 7]), c(1, 2, 7)]

MeSH ORA for Anatomy (category = “A”).

category(meshParams) <- "A"
meshR.A <- meshHyperGTest(meshParams)
head(summary(meshR.A))
summary(meshR.A)[!duplicated(summary(meshR.A)[, 7]), c(1, 2, 7)]

MeSH ORA for Phenomena and Processes (category = “G”).

category(meshParams) <- "G"
meshR.G <- meshHyperGTest(meshParams)
head(summary(meshR.G))
summary(meshR.G)[!duplicated(summary(meshR.G)[, 7]), c(1, 2, 7)]

6. Visualization for MeSH enrichment analysis

library(tagcloud)
meshR <- meshR.D
meshR.summary <- summary(meshR)[!duplicated(summary(meshR)[, 7]), ]
tags <- strmultline(meshR.summary$MESHTERM)
weights <- -log(meshR.summary$Pvalue)
or <- meshR.summary$OddsRatio
colors <- smoothPalette(or, max = 4)
tagcloud(tags, weights = weights, col = colors)

library(RColorBrewer)
colors <- smoothPalette(weights, pal = brewer.pal(11, "Spectral"))
tagcloud(tags, weights = weights, col = colors, order = "size")

7. GO semantic similarity analysis

We perform a GO semantic similarity analysis using the GOSemSim package.

library("GOSemSim")

GO semantic similarity analysis for Biological Process (BP)

library(corrplot)
goListBP <- summary(BP)[, c(1)][1:50]  # top 50
goSimMatBP <- mgoSim(goListBP, goListBP, ont = "BP", measure = "Jiang", organism = "bovine", 
    combine = NULL)
goSimMatBP[is.na(goSimMatBP)] <- 0
corrplot(goSimMatBP, is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.5)

GO semantic similarity analysis for Molecular Function (MF)

goListMF <- summary(MF)[, c(1)]  # 41
goSimMatMF <- mgoSim(goListMF, goListMF, ont = "MF", measure = "Jiang", organism = "bovine", 
    combine = NULL)
corrplot(goSimMatMF, is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

GO semantic similarity analysis for Cellular Component (CC)

goListCC <- summary(CC)[, c(1)]  # 17
goSimMatCC <- mgoSim(goListCC, goListCC, ont = "CC", measure = "Jiang", organism = "bovine", 
    combine = NULL)
corrplot(goSimMatCC, is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

8. MeSH semantic similarity analysis

Then, we perform a MeSH semantic similarity for the category Chemicals and Drugs by setting ‘category=“D”’.

headingListD <- summary(meshR.D)[!duplicated(summary(meshR.D)[, 7]), c(7)]
meshSimMatD <- mheadingSim(headingListD, headingListD, method = "JC")
rownames(meshSimMatD) <- colnames(meshSimMatD) <- summary(meshR.D)[!duplicated(summary(meshR.D)[, 
    7]), c(1)]
indexD <- which(meshSimMatD > 0.35 & meshSimMatD != 1, arr.ind = TRUE)
corrplot(meshSimMatD[unique(rownames(meshSimMatD)[indexD[, 1]]), unique(rownames(meshSimMatD)[indexD[, 
    1]])], is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

Switching to a different category is easily done by the ‘category<-’ function. Here, we use Diseases (category = “C”).

headingListC <- summary(meshR.C)[!duplicated(summary(meshR.C)[, 7]), c(7)]
meshSimMatC <- mheadingSim(headingListC, headingListC, method = "JC")
rownames(meshSimMatC) <- colnames(meshSimMatC) <- summary(meshR.C)[!duplicated(summary(meshR.C)[, 
    7]), c(1)]
corrplot(meshSimMatC, is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

MeSH ORA for Anatomy (category = “A”).

headingListA <- summary(meshR.A)[!duplicated(summary(meshR.A)[, 7]), c(7)]
meshSimMatA <- mheadingSim(headingListA, headingListA, method = "JC")
rownames(meshSimMatA) <- colnames(meshSimMatA) <- summary(meshR.A)[!duplicated(summary(meshR.A)[, 
    7]), c(1)]
indexA <- which(meshSimMatA > 0.15 & meshSimMatA != 1, arr.ind = TRUE)
corrplot(meshSimMatA[unique(rownames(meshSimMatA)[indexA[, 1]]), unique(rownames(meshSimMatA)[indexA[, 
    1]])], is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

MeSH ORA for Phenomena and Processes (category = “G”).

headingListG <- summary(meshR.G)[!duplicated(summary(meshR.G)[, 7]), c(7)]
meshSimMatG <- mheadingSim(headingListG, headingListG, method = "JC")
rownames(meshSimMatG) <- colnames(meshSimMatG) <- summary(meshR.G)[!duplicated(summary(meshR.G)[, 
    7]), c(1)]
indexG <- which(meshSimMatG > 0.1 & meshSimMatG != 1, arr.ind = TRUE)
corrplot(meshSimMatG[unique(rownames(meshSimMatG)[indexG[, 1]]), unique(rownames(meshSimMatG)[indexG[, 
    1]])], is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)

9. GO-based Gene Semantic Similarity

Compute Gene semantic similarity based on GO annotation from BP.

geneSimBP <- mgeneSim(genes = c(my.geneID3[, 6]), ont = "BP", organism = "bovine", 
    measure = "Jiang", verbose = FALSE, drop = NULL)  # 90
symbolBP <- select(org.Bt.eg.db, rownames(geneSimBP), column = "SYMBOL", keytype = "ENTREZID")
rownames(geneSimBP) <- colnames(geneSimBP) <- symbolBP[, 2]
index.geneSimBP <- which(geneSimBP > 0.8 & geneSimBP != 1, arr.ind = TRUE)
corrplot(geneSimBP, is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.8)
corrplot(geneSimBP[unique(rownames(geneSimBP)[index.geneSimBP[, 1]]), unique(rownames(geneSimBP)[index.geneSimBP[, 
    1]])], is.corr = FALSE, type = "lower", tl.col = "black", tl.cex = 0.7)

10. Random set of genes

How many siginificant GO/MeSH terms are identified from a random set of genes?

set.seed(100)
nGO <- array()
nMeSH <- array()

# sample random set of genes
for (i in 1:5) {
    cat("iteration ", i, "\n")
    ind <- sample(1:nrow(univ.geneID3), 300)
    random.geneID <- univ.geneID3[ind, ]
    
    # GO ORA for BP
    paraGO <- new("GOHyperGParams", geneIds = random.geneID[, 2], universeGeneIds = univ.geneID3[, 
        2], annotation = "org.Bt.eg.db", ontology = "BP", pvalueCutoff = 0.05, 
        conditional = TRUE, testDirection = "over")
    BP <- hyperGTest(paraGO)
    nGO[i] <- nrow(summary(BP)[, c(1, 2, 7)])
    
    # MeSH ORA for Chemicals and Drugs
    meshParams <- new("MeSHHyperGParams", geneIds = random.geneID[, 2], universeGeneIds = univ.geneID3[, 
        2], annotation = "MeSH.Bta.eg.db", category = "D", database = "gene2pubmed", 
        pvalueCutoff = 0.05, pAdjust = "none")
    meshR.D <- meshHyperGTest(meshParams)
    nMeSH[i] <- nrow(summary(meshR.D)[!duplicated(summary(meshR.D)[, 7]), c(1, 
        2, 7)])
}

# average number of GO terms
mean(nGO)
# average number of MeSH terms
mean(nMeSH)

11. Session Information

sessionInfo()
R version 3.2.4 (2016-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] rmdformats_0.2 knitr_1.12.3  

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.4     digest_0.6.9    mime_0.4        R6_2.1.2       
 [5] xtable_1.8-2    formatR_1.3     magrittr_1.5    evaluate_0.8.3 
 [9] highr_0.5.1     stringi_1.0-1   miniUI_0.1.1    rstudioapi_0.5 
[13] rmarkdown_0.9.5 tools_3.2.4     stringr_1.0.0   questionr_0.5  
[17] shiny_0.13.2    httpuv_1.3.3    yaml_2.1.13     htmltools_0.3.5

Gota Morota, Matt Spangler

May 20, 2016