-
Notifications
You must be signed in to change notification settings - Fork 0
/
shiny_compound.R
255 lines (170 loc) · 9.26 KB
/
shiny_compound.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#' The function collects alle Pubchem-annotated synonyms for a given CID by means of a RESTful HTTP request.
#' @param cid The Pubchem CID of the compound.
#' @return A list containing the CIDs and the synonyms.
get_synonyms_by_cid <- function( cid){
## create a http request which is used to retrieve all synonyms of a given compound
## and the CID
request <- paste0( pubchem_compound, cid, "/synonyms/JSON")
content <- httr::content( GET( request))
cid <- content$InformationList$Information[[1]]$CID
synonyms <- unlist( content$InformationList$Information[[1]]$Synonym)
return( list( cid=cid, synonyms=synonyms))
}
#' The function collects all IDs that are published at the Pubchem website.
#' By means of a RESTful HTTP request through the CID, alles descriptions, that are linked at the Pubchem are evaluated.
#' Based on these links, the IDs that are used at these sources can be retrieved.
#' @param cid Pubchem ID of the compound of interest.
#' @return A list containing the IDs, ID source, and a functional link to the compound page at this source.
get_description_by_cid <- function( cid){
request <- paste0( pubchem_compound, cid, "/description/JSON")
content <- httr::content( GET( request))
url <- unlist(sapply( content$InformationList$Information, function(x) x$DescriptionURL))
sources <- unlist(sapply( content$InformationList$Information, function(x) x$DescriptionSourceName))
## remove FDA IDs and URLs since these do not have parsable unique identifier
if( length( grep( "FDA", sources)) > 0){
url <- url[-grep( "FDA", sources)]
sources <- sources[ -grep( "FDA", sources)]
}
## some kind of cleaning
sources <- gsub( "Human Metabolome Database \\(HMDB\\)", "HMDB", sources)
sources <- gsub( "CAMEO Chemicals", "CAMEO", sources)
## get the corresponding IDs from the urls
ids <- gsub( ".*\\/", "", url)
ids <- gsub( ".*=", "", ids)
## turn the url into a functional link which can be clicked in the output table
functional_link <- sprintf( '<a href=\"%s\" target=\"_blank\">%s</a>', url, ids)
## try to retrieve the other MeSH identifier
## current MeSH ID looks like this: 68001241
## but MeSH IDs used by CTDbase are more like this: D001241
## therefore, use the RESTful API from NCBI to get a mapping
if( "MeSH" %in% sources ){
meshid <- ids[ grep( "MeSH", sources)]
## create request object for the current MeSH ID
## and get the json object
request <- paste0( "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=mesh&id=",
meshid,"&retmode=json")
content <- httr::content( GET( request))
## check if another identifier is in the retrieved json object
if( "ds_meshui" %in% names(content$result[[ meshid]])){
second_meshid <- content$result[[ meshid]]$ds_meshui
## add the second identifier to the list of IDs, but this time without a functional link
functional_link[ grep( "MeSH", sources)] <- paste0( functional_link[ grep( "MeSH", sources)], ", ", second_meshid)
}
}
return ( list( sources = sources, urls = url, ids = ids, functional_link = functional_link))
}
#' The function returns all pubchem entries that are found with a given compound name.
#' By means of a RESTful HTTP request, all entries that are found with a string search are returned.
#' @param name Compound to be used for searching omics data sets.
#' @return A data.frame collecting CIDs and compound names.
get_cids_by_name <- function( name){
## create a http request which is used to retrieve all synonyms of a given compound
## and the CID
name <- gsub( " ", "%20", name)
request <- paste0( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
name, "/synonyms/JSON?name_type=word")
content <- httr::content( GET( request))
if( "Fault" %in% names(content)){
return( data.frame( cid = c(0000), name = c("EMPTY") ))
}
cid <- sapply( content$InformationList$Information, function(x) x$CID)
pattern <- paste0( "^\\D*\\d\\D*", name, "\\D*$", "|" ,"^\\D*", name, "\\D*\\d\\D*$", "|", "^\\D*", name, "\\D*$")
synonyms_list <- sapply( content$InformationList$Information, function(x) x$Synonym)
synonyms <- sapply( synonyms_list, function(x){ x[grep( pattern, x, ignore.case = TRUE)[1]] })
## remove the NULL elements from synonyms and, beforehand, the respective IDs from the cid list
cid <- cid[!sapply( synonyms, is.null)]
synonyms[sapply( synonyms, is.null)] <- NULL
if( length( cid) > 1){
results <- data.frame( cid = cid, name = unlist( synonyms) )
} else{
results <- data.frame( cid = c(0000), name = c("EMPTY"))
}
return( arrange( results, cid))
}
#' This function collects important compound-related information from pubchem based on the compound CID.
#' By means of a RESTful HTTP request retrieved information contain:
#' Molecular Formular, SMILE, InChI, and the Definition (including the source)
#' @param cid The Pubchem compound ID (CID)
#' @return Dataframe with two columns (property and value)
#' @examples
#' get_compound_information( cid = 702)
#' get_compound_information( 2244)
get_compound_information <- function( cid){
## get canonical information from the properties table
request <- paste0( pubchem_compound, cid, "/property/MolecularFormula,CanonicalSMILES,InChI/JSON")
content <- httr::content( GET( request))
properties <- content$PropertyTable$Properties[[1]]
properties$CID <- NULL
properties$InChI <- gsub( "InChI=", "", properties$InChI)
## get also the description
request <- paste0( pubchem_compound, cid, "/description/JSON")
content <- httr::content( GET( request))
desc_list <- content$InformationList$Information
## get a list of potential description sources
sources <- lapply( desc_list, function(x){ x$DescriptionSourceName})
## check if one of the following sources provides a compound description: MeSH, ChEBI, DrugBank, HMDB, etc...
reference_sources <- c( "MeSH", "DrugBank", "Human Metabolome Database (HMDB)", "CAMEO Chemicals", "NCIt", "ChEBI")
## get a mapping between the reference sources and the current sources
mapping <- BiocGenerics::match( reference_sources, sources)
## extract the actual information, i.e., the description and the corresponding source
position <- mapping[ !is.na(mapping)][1]
if( !is.na( position) && position >= 1 && position <= length( desc_list)){
properties$Description <- desc_list[[ position]]$Description
properties$DescriptionSource <- desc_list[[ position]]$DescriptionSourceName
return( data.frame( values = unlist( properties), row.names = c( "Molecular Formular", "Chanonical SMILE", "InChI", "Definition", "Definition Source"), stringsAsFactors = FALSE))
} else{
properties$Description <- NULL
properties$DescriptionSource <- NULL
return( data.frame( values = unlist( properties), row.names = c( "Molecular Formular", "Chanonical SMILE", "InChI"), stringsAsFactors = FALSE))
}
}
#' Search with a given compound name or compound ID in the Comptox database for overlapping hits to refine the search.
#' @param compound The search string
#' @return Dataframe containing CIDs and Names of potentially matching compounds
get_comptox_by_input <- function( compound){
## check if the input name is a string or an ID
## in case no ID pattern matches, assume the compound is a string
if( length( grep( "^DTX[SC]ID\\d+$", compound)) > 0){
m <- unique( c( match( compound, comptox$DTXSID), match( compound, comptox$DTXCID)))
} else if( length( grep( "^\\d+$", compound)) > 0){
m <- match( compound, comptox$CID)
} else if( length( grep( "^\\d+-\\d\\d-\\d$", compound)) > 0){
## check if cas number is correct
if( !is_cas_correct( compound)){
showModal( modalDialog(
title = "CAS number ERROR",
"The entered CAS number is not correct!",
easyClose = TRUE
))
return( data.frame( cid=c(0000), name=c("CAS")))
}
m <- match( compound, comptox$CAS)
} else{
## assume the entered compound is a name
## -> pattern matching with columns NAME ans SYNONYM
sids_comptox <- comptox$CTXSID[grep( compound, comptox$NAME, ignore.case = TRUE)]
sids_synonyms <- unique( synonyms$DTXSID[ grep( compound, synonyms$SYNONYM, ignore.case = TRUE)])
sids <- unique( c( sids_comptox, sids_synonyms))
m <- match( sids, comptox$DTXSID)
}
m <- m[ !is.na( m)]
if( length( m) > 0){
df <- data.frame( cid = as.integer(comptox$CID[ m]), name = comptox$NAME[ m])
} else{
df <- data.frame( cid=c(0000), name=c("EMPTY"))
}
return( arrange( df, cid))
}
#' Retrieve all IDs from Comptox dashboard that are mapped to a given pubchem ID.
#' @param cid The input pubchem ID
#' @return A dataframe with 2 columns for ID source and ID.
#' @example get_comptox_ids_by_cid( 702)
get_comptox_ids_by_cid <- function( cid){
m <- match( cid, comptox$CID)
m <- m[ !is.na( m)]
if( length( m) == 0){
return()
}
df <- data.frame( Database = c( "CAS-RN", "Comptox Dashboard"), IDs = c( comptox$CAS[ m], c( sprintf( '<a href=\"https://comptox.epa.gov/dashboard/%s\" target=\"_blank\">%s</a>', comptox$DTXSID[m], comptox$DTXSID[m]))))
return( df)
}