Skip to content

Commit

Permalink
Tidy flatten
Browse files Browse the repository at this point in the history
  • Loading branch information
Tobias Schulze committed Aug 12, 2023
1 parent 3cf1d45 commit 0ff83bc
Show file tree
Hide file tree
Showing 4 changed files with 623 additions and 610 deletions.
6 changes: 3 additions & 3 deletions R/buildRecord.R
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,10 @@ setMethod("buildRecord", "RmbSpectrum2", function(o, ..., cpd = NULL, mbdata = l
# Here is the right place to fix the name of the INTERNAL ID field.
if(!is.null(getOption("RMassBank")$annotations$internal_id_fieldname))
{
id.col <- which(names(mbdata[["COMMENT"]]) == "ID")
if(length(id.col) > 0)
id_col <- which(names(mbdata[["COMMENT"]]) == "ID")
if(length(id_col) > 0)
{
names(mbdata[["COMMENT"]])[[id.col]] <-
names(mbdata[["COMMENT"]])[[id_col]] <-
getOption("RMassBank")$annotations$internal_id_fieldname
}
}
Expand Down
85 changes: 49 additions & 36 deletions R/createMassBank.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ loadInfolist <- function(mb, fileName)
# dbname_e will be dropped because of the select= in the subset below.
}

if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) {
colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <-
"COMMENT.ID"
if("COMMENT_EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) {
colnames(mbdata_new)[[which(colnames(mbdata_new) == "COMMENT_EAWAG_UCHEM_ID")]] <-
"COMMENT_ID"
}

# use only the columns present in mbdata_archive, no other columns added in excel
col_names <- colnames(mb@mbdata_archive)
comment_colnames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\.(?!CONFIDENCE)(?!ID)", perl = TRUE)]
comment_colnames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\_(?!CONFIDENCE)(?!ID)", perl = TRUE)]
col_names <- c(col_names, comment_colnames)

## The read infolists might not have all required / expected columns
Expand Down Expand Up @@ -138,8 +138,8 @@ resetInfolists <- function(mb)
{
mb@mbdata_archive <-
structure(list(id = integer(0), dbcas = character(0),
dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0),
COMMENT.ID = integer(0), `CH$NAME1` = character(0),
dbname = character(0), dataused = character(0), COMMENT_CONFIDENCE = character(0),
COMMENT_ID = integer(0), `CH$NAME1` = character(0),
`CH$NAME2` = character(0), `CH$NAME3` = character(0), `CH$NAME4` = character(0),
`CH$NAME5` = character(0), `CH$COMPOUND_CLASS` = character(0),
`CH$FORMULA` = character(0), `CH$EXACT_MASS` = numeric(0),` CH$SMILES` = character(0),
Expand All @@ -149,7 +149,7 @@ resetInfolists <- function(mb)
`CH$LINK_CHEMSPIDER` = integer(0), `CH$LINK_COMPTOX` = character(0),
AUTHORS = character(0), COPYRIGHT = character(0)
), .Names = c("id", "dbcas",
"dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID",
"dbname", "dataused", "COMMENT_CONFIDENCE", "COMMENT_ID",
"CH$NAME1", "CH$NAME2", "CH$NAME3", "CH$NAME4", "CH$NAME5", "CH$COMPOUND_CLASS", "CH$FORMULA",
"CH$EXACT_MASS", "CH$SMILES", "CH$IUPAC", "CH$LINK_CAS", "CH$LINK_CHEBI",
"CH$LINK_HMDB", "CH$LINK_KEGG", "CH$LINK_LIPIDMAPS", "CH$LINK_PUBCHEM",
Expand Down Expand Up @@ -254,7 +254,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
rmb_log_info("mbWorkflow: Step 2. Export infolist (if required)")
if(length(mb@mbdata)>0)
{
mbdata <- tibble::as_tibble(flatten(mb@mbdata))
mbdata <- flatten(mb@mbdata)
readr::write_csv(x = mbdata, file = infolist_path, col_names = TRUE, na = "", quote = "needed")
rmb_log_info(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder."))
return(mb)
Expand Down Expand Up @@ -585,14 +585,19 @@ gatherData <- function(id)

if(usebabel){
cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey')
inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE)
} else{
inchikey <- getCactus(smiles, 'stdinchikey')
inchikey_split <- system(cmdinchikey, intern = TRUE, input = smiles, ignore.stderr = TRUE)
} else {
inchi_key <- getCactus(identifier = smiles, representation = "stdinchikey")

if(is.na(inchi_key)) {
inchi_key <- getPcInchiKey(query = smiles, from = "smiles")
}

if(!is.na(inchikey)){
##Split the "InChiKey=" part off the key
inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
} else{
inchikey_split <- getPcInchiKey(smiles)
inchikey_split <- strsplit(inchi_key, "=", fixed = TRUE)[[1]][[2]]
} else {
inchikey_split <- getPcInchiKey(query = smiles, from = "smiles")
}
}

Expand Down Expand Up @@ -1120,7 +1125,7 @@ gatherDataUnknown <- function(id, mode, retrieval){
#' \code{\link{gatherData}}.
#' @param row One row of MassBank compound information retrieved from an
#' infolist.
#' @return \code{flatten} returns a matrix (not a data frame) to be written to
#' @return \code{flatten} returns a tibble (not a matrix or plain data frame) to be written to
#' CSV.
#'
#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID},
Expand All @@ -1146,19 +1151,19 @@ flatten <- function(mbdata)
.checkMbSettings()

colNames <- names(unlist(mbdata[[1]]))
commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\.")]
commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\_")]

colList <- c(
"id",
"dbcas",
"dbname",
"dataused",
commentNames,
#"COMMENT.CONFIDENCE",
#"COMMENT_CONFIDENCE",
# Note: The field name of the internal id field is replaced with the real name
# at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord()
# must use the full name including the info from options("RMassBank").
#"COMMENT.ID",
#"COMMENT_ID",
"CH$NAME1",
"CH$NAME2",
"CH$NAME3",
Expand All @@ -1177,27 +1182,35 @@ flatten <- function(mbdata)
"CH$LINK_PUBCHEM",
"CH$LINK_INCHIKEY",
"CH$LINK_CHEMSPIDER",
"CH$LINK_COMPTOX"
"CH$LINK_COMPTOX"
)
# make an empty data frame with the right length
rows <- length(mbdata)
cols <- length(colList)
mbframe <- matrix(data=NA, nrow=rows, ncol=cols)
colnames(mbframe) <- colList

mbtbl <- tibble::tibble(!!!colList, .rows = 0, .name_repair = ~ colList)


#mbframe <- matrix(data = NA, nrow = rows, ncol = cols)
#colnames(mbframe) <- colList
#browser()
for(row in 1:rows)
{
for(i in 1:rows) {
# fill in all the data into the dataframe: all columns which
# a) exist in the target dataframe and b) exist in the (unlisted) MB record
# are written into the dataframe.
data <- unlist(mbdata[[row]])
data <- unlist(mbdata[[i]], use.names = TRUE)
names(data) <- gsub("\\.", "_", names(data))
# bugfix for the case of only one name
if(!("CH$NAME1" %in% names(data)))
data[["CH$NAME1"]] <- data[["CH$NAME"]]
datacols <- intersect(colList, names(data))
mbframe[row,datacols] <- data[datacols]
if(!("CH$NAME1" %in% names(data))) {
data[["CH$NAME1"]] <- data[["CH$NAME"]]
}
datacols <- intersect(colList, names(data))

mbtbl <- mbtbl |> dplyr::bind_rows(data[datacols])

}
return(mbframe)

return(mbtbl)

}

Expand Down Expand Up @@ -1231,8 +1244,8 @@ readMbdata <- function(row)
# This is not very flexible, as you can see...
colList <- c(
commentNames,
#"COMMENT.CONFIDENCE",
#"COMMENT.ID",
#"COMMENT_CONFIDENCE",
#"COMMENT_ID",
"CH$NAME1",
"CH$NAME2",
"CH$NAME3",
Expand All @@ -1253,10 +1266,10 @@ readMbdata <- function(row)
"CH$LINK_CHEMSPIDER",
"CH$LINK_COMPTOX")
mbdata[["COMMENT"]] = list()
#mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]]
#mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT_CONFIDENCE"]]
# Again, our ID field.
#mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT.ID"]]
mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\.", replacement = "")] <- row[commentNames]
#mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT_ID"]]
mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\_", replacement = "")] <- row[commentNames]

names = c(row[["CH$NAME1"]], row[["CH$NAME2"]], row[["CH$NAME3"]], row[["CH$NAME4"]], row[["CH$NAME5"]])
names = names[which(!is.na(names))]
Expand All @@ -1283,8 +1296,8 @@ readMbdata <- function(row)
mbdata[["CH$LINK"]] <- link

## SP$SAMPLE
if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE))
mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]]
if(all(nchar(row[["SP_SAMPLE"]]) > 0, row[["SP_SAMPLE"]] != "NA", !is.na(row[["SP_SAMPLE"]]), na.rm = TRUE))
mbdata[['SP$SAMPLE']] <- row[["SP_SAMPLE"]]

if(!is.na(row[["AUTHORS"]]))
mbdata[["AUTHORS"]] = row[["AUTHORS"]]
Expand Down
Loading

0 comments on commit 0ff83bc

Please sign in to comment.