diff --git a/R/buildRecord.R b/R/buildRecord.R index 1268344..1fcaae7 100644 --- a/R/buildRecord.R +++ b/R/buildRecord.R @@ -284,10 +284,10 @@ setMethod("buildRecord", "RmbSpectrum2", function(o, ..., cpd = NULL, mbdata = l # Here is the right place to fix the name of the INTERNAL ID field. if(!is.null(getOption("RMassBank")$annotations$internal_id_fieldname)) { - id.col <- which(names(mbdata[["COMMENT"]]) == "ID") - if(length(id.col) > 0) + id_col <- which(names(mbdata[["COMMENT"]]) == "ID") + if(length(id_col) > 0) { - names(mbdata[["COMMENT"]])[[id.col]] <- + names(mbdata[["COMMENT"]])[[id_col]] <- getOption("RMassBank")$annotations$internal_id_fieldname } } diff --git a/R/createMassBank.R b/R/createMassBank.R index 63bccf2..6dcf92a 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -90,14 +90,14 @@ loadInfolist <- function(mb, fileName) # dbname_e will be dropped because of the select= in the subset below. } - if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) { - colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <- - "COMMENT.ID" + if("COMMENT_EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) { + colnames(mbdata_new)[[which(colnames(mbdata_new) == "COMMENT_EAWAG_UCHEM_ID")]] <- + "COMMENT_ID" } # use only the columns present in mbdata_archive, no other columns added in excel col_names <- colnames(mb@mbdata_archive) - comment_colnames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\.(?!CONFIDENCE)(?!ID)", perl = TRUE)] + comment_colnames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\_(?!CONFIDENCE)(?!ID)", perl = TRUE)] col_names <- c(col_names, comment_colnames) ## The read infolists might not have all required / expected columns @@ -138,8 +138,8 @@ resetInfolists <- function(mb) { mb@mbdata_archive <- structure(list(id = integer(0), dbcas = character(0), - dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), - COMMENT.ID = integer(0), `CH$NAME1` = character(0), + dbname = character(0), dataused = character(0), COMMENT_CONFIDENCE = character(0), + COMMENT_ID = integer(0), `CH$NAME1` = character(0), `CH$NAME2` = character(0), `CH$NAME3` = character(0), `CH$NAME4` = character(0), `CH$NAME5` = character(0), `CH$COMPOUND_CLASS` = character(0), `CH$FORMULA` = character(0), `CH$EXACT_MASS` = numeric(0),` CH$SMILES` = character(0), @@ -149,7 +149,7 @@ resetInfolists <- function(mb) `CH$LINK_CHEMSPIDER` = integer(0), `CH$LINK_COMPTOX` = character(0), AUTHORS = character(0), COPYRIGHT = character(0) ), .Names = c("id", "dbcas", - "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", + "dbname", "dataused", "COMMENT_CONFIDENCE", "COMMENT_ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", "CH$NAME4", "CH$NAME5", "CH$COMPOUND_CLASS", "CH$FORMULA", "CH$EXACT_MASS", "CH$SMILES", "CH$IUPAC", "CH$LINK_CAS", "CH$LINK_CHEBI", "CH$LINK_HMDB", "CH$LINK_KEGG", "CH$LINK_LIPIDMAPS", "CH$LINK_PUBCHEM", @@ -254,7 +254,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c rmb_log_info("mbWorkflow: Step 2. Export infolist (if required)") if(length(mb@mbdata)>0) { - mbdata <- tibble::as_tibble(flatten(mb@mbdata)) + mbdata <- flatten(mb@mbdata) readr::write_csv(x = mbdata, file = infolist_path, col_names = TRUE, na = "", quote = "needed") rmb_log_info(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) return(mb) @@ -585,14 +585,19 @@ gatherData <- function(id) if(usebabel){ cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - inchikey <- getCactus(smiles, 'stdinchikey') + inchikey_split <- system(cmdinchikey, intern = TRUE, input = smiles, ignore.stderr = TRUE) + } else { + inchi_key <- getCactus(identifier = smiles, representation = "stdinchikey") + + if(is.na(inchi_key)) { + inchi_key <- getPcInchiKey(query = smiles, from = "smiles") + } + if(!is.na(inchikey)){ ##Split the "InChiKey=" part off the key - inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] - } else{ - inchikey_split <- getPcInchiKey(smiles) + inchikey_split <- strsplit(inchi_key, "=", fixed = TRUE)[[1]][[2]] + } else { + inchikey_split <- getPcInchiKey(query = smiles, from = "smiles") } } @@ -1120,7 +1125,7 @@ gatherDataUnknown <- function(id, mode, retrieval){ #' \code{\link{gatherData}}. #' @param row One row of MassBank compound information retrieved from an #' infolist. -#' @return \code{flatten} returns a matrix (not a data frame) to be written to +#' @return \code{flatten} returns a tibble (not a data frame or matrix) to be written to #' CSV. #' #' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, @@ -1146,7 +1151,7 @@ flatten <- function(mbdata) .checkMbSettings() colNames <- names(unlist(mbdata[[1]])) - commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\.")] + commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\_")] colList <- c( "id", @@ -1154,11 +1159,11 @@ flatten <- function(mbdata) "dbname", "dataused", commentNames, - #"COMMENT.CONFIDENCE", + #"COMMENT_CONFIDENCE", # Note: The field name of the internal id field is replaced with the real name # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() # must use the full name including the info from options("RMassBank"). - #"COMMENT.ID", + #"COMMENT_ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1177,27 +1182,35 @@ flatten <- function(mbdata) "CH$LINK_PUBCHEM", "CH$LINK_INCHIKEY", "CH$LINK_CHEMSPIDER", - "CH$LINK_COMPTOX" + "CH$LINK_COMPTOX" ) # make an empty data frame with the right length rows <- length(mbdata) cols <- length(colList) - mbframe <- matrix(data=NA, nrow=rows, ncol=cols) - colnames(mbframe) <- colList + + mbtbl <- tibble::tibble(!!!colList, .rows = 0, .name_repair = ~ colList) + + + #mbframe <- matrix(data = NA, nrow = rows, ncol = cols) + #colnames(mbframe) <- colList #browser() - for(row in 1:rows) - { + for(i in 1:rows) { # fill in all the data into the dataframe: all columns which # a) exist in the target dataframe and b) exist in the (unlisted) MB record # are written into the dataframe. - data <- unlist(mbdata[[row]]) + data <- unlist(mbdata[[i]], use.names = TRUE) + names(data) <- gsub("\\.", "_", names(data)) # bugfix for the case of only one name - if(!("CH$NAME1" %in% names(data))) - data[["CH$NAME1"]] <- data[["CH$NAME"]] - datacols <- intersect(colList, names(data)) - mbframe[row,datacols] <- data[datacols] + if(!("CH$NAME1" %in% names(data))) { + data[["CH$NAME1"]] <- data[["CH$NAME"]] + } + datacols <- intersect(colList, names(data)) + + mbtbl <- mbtbl |> dplyr::bind_rows(data[datacols]) + } - return(mbframe) + + return(mbtbl) } @@ -1231,8 +1244,8 @@ readMbdata <- function(row) # This is not very flexible, as you can see... colList <- c( commentNames, - #"COMMENT.CONFIDENCE", - #"COMMENT.ID", + #"COMMENT_CONFIDENCE", + #"COMMENT_ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1253,10 +1266,10 @@ readMbdata <- function(row) "CH$LINK_CHEMSPIDER", "CH$LINK_COMPTOX") mbdata[["COMMENT"]] = list() - #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] + #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT_CONFIDENCE"]] # Again, our ID field. - #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT.ID"]] - mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\.", replacement = "")] <- row[commentNames] + #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT_D"]] + mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\_", replacement = "")] <- row[commentNames] names = c(row[["CH$NAME1"]], row[["CH$NAME2"]], row[["CH$NAME3"]], row[["CH$NAME4"]], row[["CH$NAME5"]]) names = names[which(!is.na(names))] @@ -1283,8 +1296,8 @@ readMbdata <- function(row) mbdata[["CH$LINK"]] <- link ## SP$SAMPLE - if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE)) - mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]] + if(all(nchar(row[["SP_SAMPLE"]]) > 0, row[["SP_SAMPLE"]] != "NA", !is.na(row[["SP_SAMPLE"]]), na.rm = TRUE)) + mbdata[['SP$SAMPLE']] <- row[["SP_SAMPLE"]] if(!is.na(row[["AUTHORS"]])) mbdata[["AUTHORS"]] = row[["AUTHORS"]] diff --git a/R/webAccess.R b/R/webAccess.R index b199fc3..2517c1c 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -1,570 +1,570 @@ -#' @import XML rjson httr -NULL -## library(XML) -## library(httr) -## library(jsonlite) - - -retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retryDelayInSeconds = 3){ - - data <- NULL - queryIsSuccessful <- FALSE - numberOfRetries <- 0 - while(!queryIsSuccessful & numberOfRetries < maximumNumberOfRetries){ - data <- tryCatch( - expr = { - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - - queryIsSuccessful <- TRUE - data - }, - warning=function(w){ - numberOfRetries <<- numberOfRetries + 1 - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of warning '", w, "'\n", sep = "")) - if(numberOfRetries < maximumNumberOfRetries) - Sys.sleep(time = retryDelayInSeconds) - }, - error=function(e){ - numberOfRetries <<- numberOfRetries + 1 - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of error '", e, "'\n", sep = "")) - if(numberOfRetries < maximumNumberOfRetries) - Sys.sleep(time = retryDelayInSeconds) - } - ) - } - - return(data) -} - -#' Retrieve information from Cactus -#' -#' Retrieves information from the Cactus Chemical Identifier Resolver -#' (PubChem). -#' -#' It is not necessary to specify in which format the \code{identifier} is. -#' Somehow, cactus does this automatically. -#' -#' @usage getCactus(identifier, representation) -#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI -#' key or a SMILES code. -#' @param representation The desired representation, as required from the -#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... -#' Refer to the webpage for details. -#' @return The result of the query, in plain text. Can be NA, or one or -#' multiple lines (character array) of results. -#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), -#' which must be removed for most database searches in other databases (e.g. -#' CTS). -#' @author Michael Stravs -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} -#' @references cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' @examples -#' -#' # Benzene: -#' getCactus("C1=CC=CC=C1", "cas") -#' getCactus("C1=CC=CC=C1", "stdinchikey") -#' getCactus("C1=CC=CC=C1", "chemspider_id") -#' -#' @export -#' -#' -getCactus <- function(identifier,representation){ - identifier <- gsub('#', '%23', identifier) - ret <- tryCatch(httr::GET(paste("https://cactus.nci.nih.gov/chemical/structure/", - utils::URLencode(identifier), "/", representation, sep = "")), - error = function(e) NA) - if (all(is.na(ret))) - return(NA) - if (ret["status_code"] == 404) - return(NA) - ret <- httr::content(ret) - return(unlist(strsplit(ret, "\n"))) - -} - -#' Search Pubchem CID -#' -#' Retrieves PubChem CIDs for a search term. -#' -#' Only the first result is returned currently. \bold{The function should be -#' regarded as experimental and has not thoroughly been tested.} -#' -#' @usage getPcId(query, from = "inchikey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @return The PubChem CID (in string type). -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} -#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} -#' -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' @examples -#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") -#' -#' @export -getPcId <- function(query, from = "inchikey") -{ - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - # This happens if the InChI key is not found: - r <- fromJSON(data) - - if(!is.null(r$Fault)) - return(NA) - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - PcID <- r$InformationList$Information[[titleEntry]]$CID - - if(is.null(PcID)){ - return(NA) - } else{ - return(PcID) - } -} - - -#' Retrieve information from CTS -#' -#' Retrieves a complete CTS record from the InChI key. -#' -#' @usage getCtsRecord(key) -#' -#' @param key The InChI key. -#' @return Returns a list with all information from CTS: \code{inchikey, -#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains -#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} -#' indicates either a normal name or a specific IUPAC name, see below). -#' \code{externalIds} contains an unordered list of identifiers of the compound in -#' various databases (\code{name, value}, where \code{name} is the database name and -#' \code{value} the identifier in that database.) -#' -#' @note Currently, the CTS results are still incomplete; the name scores are all 0, -#' formula and exact mass return zero. -#' @references Chemical Translation Service: -#' \url{https://cts.fiehnlab.ucdavis.edu} -#' -#' @examples -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' # show all synonym "types" -#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) -#' \dontrun{print(types)} -#' -#' @author Michele Stravs, Eawag -#' @export -getCtsRecord <- function(key) -{ - baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/compound/" - - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch({ - url <- paste0(baseURL,key) - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - r <- fromJSON(data) - if(length(r) == 1) - if(r == "You entered an invalid InChIKey") - return(list()) - return(r) -} - -#' Convert a single ID to another using CTS. -#' -#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @param to Desired output ID -#' @return An unordered array with the resulting converted key(s). -#' -#' @examples -#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") -#' @author Michele Stravs, Eawag -#' @export -getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") -{ - baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/convert" - url <- paste(baseURL, from, to, query, sep='/') - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - if(res$status_code != 200){ - warning(paste("CTS has return code", res$status_code)) - return(NULL) - } - - r <- fromJSON(data) - if(length(r) == 0) - return(NULL) - else - { - # read out the results in simplest form: - results <- unlist(lapply(r, function(row) row$result)) - return(results) - } -} - -#' Select a subset of external IDs from a CTS record. -#' -#' @usage CTS.externalIdSubset(data, database) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @param database The database for which keys should be returned. -#' @return Returns an array of all external identifiers stored in the record for the -#' given database. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdSubset <- function(data, database) -{ - select <- which(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] == database - }))) - keyEntries <- data$externalIds[select] - keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) -} - -#' Find all available databases for a CTS record -#' -#' @usage CTS.externalIdTypes(data) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @return Returns an array of all database names for which there are external -#' identifiers stored in the record. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all databases for which the benzene entry has -#' # links in the CTS record. -#' -#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' databases <- CTS.externalIdTypes(data) -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdTypes <- function(data) -{ - unique(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] - }))) -} - -.pubChemOnline <- function(){ - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - tryCatch({ - res <- GET(utils::URLencode(url)) - ret <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - warning("Pubchem is currently offline") - return(FALSE) - } else{ - return(TRUE) - } -} - - - -getPcCHEBI <- function(query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "synonyms", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) - synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym - matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) - - # It doesn't matter if the db is down or if chebi isn't found, so return NA also - if(length(matchChebi) == 0){ - return (NA) - } else { - return (sapply(matchChebi, function(x) synonymList[[x]])) - } -} - -#' Retrieve the Chemspider ID for a given compound -#' -#' Given an InChIKey, this function queries the chemspider web API to retrieve -#' the Chemspider ID of he compound with that InChIkey. -#' -#' @usage getCSID(query) -#' -#' @param query The InChIKey of the compound -#' @return Returns the chemspide -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -getCSID <- function(query) -{ - baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" - url <- paste0(baseURL, query) - - data <- retrieveDataWithRetry(url = utils::URLencode(url), timeout=8) - if(is.null(data)) { - warning("Chemspider is currently offline") - return(NA) - } - - xml <- xmlParseDoc(data,asText=TRUE) - # the returned XML document contains only the root node called "string" which contains the correct CSID - idNodes <- getNodeSet(xml, "/") - id <- xmlValue(idNodes[[1]]) - return(id) -} - -##This function returns a sensible name for the compound -getPcSynonym <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the synonym - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - title <- r$InformationList$Information[[titleEntry]]$Title - - if(is.null(title)){ - return(NA) - } else{ - return(title) - } -} - - -##A function to retrieve a IUPAC Name from Pubchem -getPcIUPAC <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the IUPAC-Names - if(!is.null(r$PC_Compounds[[1]]$props)){ - IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) - if(length(IUPACIndex) > 0){ - # Retrieve all IUPAC-Names - IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) - if(!is.null(IUPACEntries)){ - # Is there a preferred IUPAC-Name? If yes, retrieve that - PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) - } else{return(NA)} - } else{return(NA)} - } else{return(NA)} - - - if(length(PrefIUPAC) == 1){ - return(IUPACEntries[[PrefIUPAC]]$value$sval) - } else{ - # Else it doesn't matter which - return(IUPACEntries[[1]]$value$sval) - } -} - -getPcInchiKey <- function(query, from = "smiles"){ - # Get the JSON-Data from Pubchem - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - if(!is.null(r$PC_Compounds[[1]]$props)){ - INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") - if(length(INKEYindex) > 0){ - return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) - } else{return(NA)} - } else{return(NA)} - - -} - -getPcSDF <- function(query, from = "smiles"){ - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "sdf", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch({ - res <- GET(utils::URLencode(url)) - data <- httr::content(res, type="text", encoding="UTF-8") - }, - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 - data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") - return(data) -} - +#' @import XML rjson httr +NULL +## library(XML) +## library(httr) +## library(jsonlite) + + +retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retryDelayInSeconds = 3){ + + data <- NULL + queryIsSuccessful <- FALSE + numberOfRetries <- 0 + while(!queryIsSuccessful & numberOfRetries < maximumNumberOfRetries){ + data <- tryCatch( + expr = { + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + + queryIsSuccessful <- TRUE + data + }, + warning=function(w){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of warning '", w, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + }, + error=function(e){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of error '", e, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + } + ) + } + + return(data) +} + +#' Retrieve information from Cactus +#' +#' Retrieves information from the Cactus Chemical Identifier Resolver +#' (PubChem). +#' +#' It is not necessary to specify in which format the \code{identifier} is. +#' Somehow, cactus does this automatically. +#' +#' @usage getCactus(identifier, representation) +#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI +#' key or a SMILES code. +#' @param representation The desired representation, as required from the +#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... +#' Refer to the webpage for details. +#' @return The result of the query, in plain text. Can be NA, or one or +#' multiple lines (character array) of results. +#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), +#' which must be removed for most database searches in other databases (e.g. +#' CTS). +#' @author Michael Stravs +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} +#' @references cactus Chemical Identifier Resolver: +#' \url{http://cactus.nci.nih.gov/chemical/structure} +#' @examples +#' +#' # Benzene: +#' getCactus("C1=CC=CC=C1", "cas") +#' getCactus("C1=CC=CC=C1", "stdinchikey") +#' getCactus("C1=CC=CC=C1", "chemspider_id") +#' +#' @export +#' +#' +getCactus <- function(identifier, representation){ + identifier <- gsub('#', '%23', identifier) + ret <- tryCatch(httr::GET(paste("https://cactus.nci.nih.gov/chemical/structure/", + utils::URLencode(identifier), "/", representation, sep = "")), + error = function(e) NA) + if (all(is.na(ret))) + return(NA) + if (ret["status_code"] == 404) + return(NA) + ret <- httr::content(ret) + return(unlist(strsplit(ret, "\n"))) + +} + +#' Search Pubchem CID +#' +#' Retrieves PubChem CIDs for a search term. +#' +#' Only the first result is returned currently. \bold{The function should be +#' regarded as experimental and has not thoroughly been tested.} +#' +#' @usage getPcId(query, from = "inchikey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @return The PubChem CID (in string type). +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} +#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} +#' +#' Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' @examples +#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") +#' +#' @export +getPcId <- function(query, from = "inchikey") +{ + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + # This happens if the InChI key is not found: + r <- fromJSON(data) + + if(!is.null(r$Fault)) + return(NA) + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + PcID <- r$InformationList$Information[[titleEntry]]$CID + + if(is.null(PcID)){ + return(NA) + } else{ + return(PcID) + } +} + + +#' Retrieve information from CTS +#' +#' Retrieves a complete CTS record from the InChI key. +#' +#' @usage getCtsRecord(key) +#' +#' @param key The InChI key. +#' @return Returns a list with all information from CTS: \code{inchikey, +#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains +#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} +#' indicates either a normal name or a specific IUPAC name, see below). +#' \code{externalIds} contains an unordered list of identifiers of the compound in +#' various databases (\code{name, value}, where \code{name} is the database name and +#' \code{value} the identifier in that database.) +#' +#' @note Currently, the CTS results are still incomplete; the name scores are all 0, +#' formula and exact mass return zero. +#' @references Chemical Translation Service: +#' \url{https://cts.fiehnlab.ucdavis.edu} +#' +#' @examples +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' # show all synonym "types" +#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) +#' \dontrun{print(types)} +#' +#' @author Michele Stravs, Eawag +#' @export +getCtsRecord <- function(key) +{ + baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/compound/" + + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch({ + url <- paste0(baseURL,key) + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + r <- fromJSON(data) + if(length(r) == 1) + if(r == "You entered an invalid InChIKey") + return(list()) + return(r) +} + +#' Convert a single ID to another using CTS. +#' +#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @param to Desired output ID +#' @return An unordered array with the resulting converted key(s). +#' +#' @examples +#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") +#' @author Michele Stravs, Eawag +#' @export +getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") +{ + baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/convert" + url <- paste(baseURL, from, to, query, sep='/') + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + if(res$status_code != 200){ + warning(paste("CTS has return code", res$status_code)) + return(NULL) + } + + r <- fromJSON(data) + if(length(r) == 0) + return(NULL) + else + { + # read out the results in simplest form: + results <- unlist(lapply(r, function(row) row$result)) + return(results) + } +} + +#' Select a subset of external IDs from a CTS record. +#' +#' @usage CTS.externalIdSubset(data, database) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @param database The database for which keys should be returned. +#' @return Returns an array of all external identifiers stored in the record for the +#' given database. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdSubset <- function(data, database) +{ + select <- which(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] == database + }))) + keyEntries <- data$externalIds[select] + keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) +} + +#' Find all available databases for a CTS record +#' +#' @usage CTS.externalIdTypes(data) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @return Returns an array of all database names for which there are external +#' identifiers stored in the record. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all databases for which the benzene entry has +#' # links in the CTS record. +#' +#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' databases <- CTS.externalIdTypes(data) +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdTypes <- function(data) +{ + unique(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] + }))) +} + +.pubChemOnline <- function(){ + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + tryCatch({ + res <- GET(utils::URLencode(url)) + ret <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + warning("Pubchem is currently offline") + return(FALSE) + } else{ + return(TRUE) + } +} + + + +getPcCHEBI <- function(query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "synonyms", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) + synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym + matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) + + # It doesn't matter if the db is down or if chebi isn't found, so return NA also + if(length(matchChebi) == 0){ + return (NA) + } else { + return (sapply(matchChebi, function(x) synonymList[[x]])) + } +} + +#' Retrieve the Chemspider ID for a given compound +#' +#' Given an InChIKey, this function queries the chemspider web API to retrieve +#' the Chemspider ID of he compound with that InChIkey. +#' +#' @usage getCSID(query) +#' +#' @param query The InChIKey of the compound +#' @return Returns the chemspide +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +getCSID <- function(query) +{ + baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" + url <- paste0(baseURL, query) + + data <- retrieveDataWithRetry(url = utils::URLencode(url), timeout=8) + if(is.null(data)) { + warning("Chemspider is currently offline") + return(NA) + } + + xml <- xmlParseDoc(data,asText=TRUE) + # the returned XML document contains only the root node called "string" which contains the correct CSID + idNodes <- getNodeSet(xml, "/") + id <- xmlValue(idNodes[[1]]) + return(id) +} + +##This function returns a sensible name for the compound +getPcSynonym <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the synonym + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + title <- r$InformationList$Information[[titleEntry]]$Title + + if(is.null(title)){ + return(NA) + } else{ + return(title) + } +} + + +##A function to retrieve a IUPAC Name from Pubchem +getPcIUPAC <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the IUPAC-Names + if(!is.null(r$PC_Compounds[[1]]$props)){ + IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) + if(length(IUPACIndex) > 0){ + # Retrieve all IUPAC-Names + IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) + if(!is.null(IUPACEntries)){ + # Is there a preferred IUPAC-Name? If yes, retrieve that + PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) + } else{return(NA)} + } else{return(NA)} + } else{return(NA)} + + + if(length(PrefIUPAC) == 1){ + return(IUPACEntries[[PrefIUPAC]]$value$sval) + } else{ + # Else it doesn't matter which + return(IUPACEntries[[1]]$value$sval) + } +} + +getPcInchiKey <- function(query, from = "smiles"){ + # Get the JSON-Data from Pubchem + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- httr::GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- rjson::fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + if(!is.null(r$PC_Compounds[[1]]$props)){ + INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") + if(length(INKEYindex) > 0){ + return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) + } else{return(NA)} + } else{return(NA)} + + +} + +getPcSDF <- function(query, from = "smiles"){ + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "sdf", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + res <- GET(utils::URLencode(url)) + data <- httr::content(res, type="text", encoding="UTF-8") + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 + data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") + return(data) +} + diff --git a/man/flatten.Rd b/man/flatten.Rd index 0069371..2d60a06 100755 --- a/man/flatten.Rd +++ b/man/flatten.Rd @@ -17,7 +17,7 @@ readMbdata(row) infolist.} } \value{ -\code{flatten} returns a matrix (not a data frame) to be written to +\code{flatten} returns a tibble (not a data frame or matrix) to be written to CSV. \code{readMbdata} returns a list of type \code{list(id= \var{compoundID},