diff --git a/.Rbuildignore b/.Rbuildignore index 62e6831..4ed939c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ man-roxygen ^docs$ ^pkgdown$ ^\.github$ +^vignettes/articles$ diff --git a/DESCRIPTION b/DESCRIPTION index e921d9d..cb4b2f0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,15 @@ Package: arcMS Type: Package Title: MS converter from UNIFI to Parquet and HDF5 formats -Version: 1.1.0 +Version: 1.1.1 Authors@R: c( person("Julien", "Le Roux", email = "julien.le-roux@u-pec.fr", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0245-8536")), person("Julien", "Sade", email = "julien.sade@u-pec.fr", role = "aut") ) Maintainer: Julien Le Roux -Description: Functions to connect to Waters UNIFI API and convert data to parquet or HDF5 format. +Description: Functions to connect to Waters UNIFI API and convert data to Parquet or HDF5 format. License: MIT + file LICENSE Encoding: UTF-8 -LazyData: true RoxygenNote: 7.3.1 Suggests: testthat (>= 3.0.0), @@ -22,7 +21,10 @@ Suggests: shiny, bs4Dash, knitr, - rmarkdown + rmarkdown, + duckdb, + plotly, + fontawesome Config/testthat/edition: 3 Imports: methods, diff --git a/NAMESPACE b/NAMESPACE index b0c1ae2..98b6a06 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,18 +14,18 @@ export(get_sample_infos) export(get_samples_list) export(run_app) export(sample_dataset) -export(sample_dataset_ondisk) export(sample_infos) export(save_one_sample_data) export(store_unifi_api_token) exportClasses(connection_params) exportClasses(sample_dataset) -exportClasses(sample_dataset_ondisk) exportClasses(sample_infos) +exportMethods(add_sample_id) exportMethods(connection_apihosturl) exportMethods(connection_token) exportMethods(get_analysis_name) exportMethods(get_sample_data) +exportMethods(get_sample_id) exportMethods(get_sample_metadata) exportMethods(get_sample_metadata_json) exportMethods(get_sample_name) @@ -38,10 +38,12 @@ import(methods) import(withr) importFrom(RProtoBuf,readProtoFiles) importFrom(Rcpp,evalCpp) +importFrom(arrow,open_dataset) importFrom(arrow,read_parquet) importFrom(arrow,write_parquet) importFrom(checkmate,makeAssertion) importFrom(checkmate,vname) +importFrom(dplyr,collect) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(future,multisession) diff --git a/R/LICENSE b/R/LICENSE deleted file mode 100644 index bc9a584..0000000 --- a/R/LICENSE +++ /dev/null @@ -1,2 +0,0 @@ -YEAR: 2023 -COPYRIGHT HOLDER: arcMS authors diff --git a/R/LICENSE.md b/R/LICENSE.md deleted file mode 100644 index 847b1eb..0000000 --- a/R/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -# MIT License - -Copyright (c) 2023 arcMS authors - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/R/README.Rmd b/R/README.Rmd deleted file mode 100644 index 1d528e2..0000000 --- a/R/README.Rmd +++ /dev/null @@ -1,159 +0,0 @@ ---- -output: github_document ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.path = "man/figures/README-", - out.width = "100%" -) -``` - -# 🏹 arcMS - - - - - -`arcMS` can convert (HD)MSE data acquired with Waters UNIFI to tabular format for use in R or Python, with a small filesize when saved on disk. It is compatible with data containing ion mobility (HDMSE) or not (MSE). - -Two output data file formats can be obtained: - -- the [Apache Parquet](https://parquet.apache.org/) format for minimal filesize and fast access. Two files are produced: one for MS data, one for metadata. - -- the [HDF5](https://www.hdfgroup.org/solutions/hdf5/) format with all data and metadata in one file, fast access but larger filesize. - -`arcMS` stands for *accessible*, *rapid* and *compact*, and is also based on the french word *arc*, which means *bow,* to emphasize that it is compatible with the [Apache Arrow library](https://arrow.apache.org/). - -A companion app (R/Shiny app) is provided at for fast visualization of the converted data (Parquet format) as 2D plots, TIC, BPI or EIC chromatograms... - -## :arrow_down: Installation - -You can install `arcMS` in R with the following command: - -```{r eval=FALSE} -install.packages("pak") -pak::pak("leesulab/arcMS") -``` - -To use the HDF5 format, the `rhdf5` package needs to be installed: - -```{r eval=FALSE} -pak::pak("rhdf5") -``` - -## 🚀 Usage - -First load the package: - -```{r eval=FALSE} -library("arcMS") -``` - -```{r include=FALSE} -library("arcMS") -``` - -Then create connection parameters to the UNIFI API (retrieve token). See `vignette("api-configuration")` to know how to configure the API and register a client app. - -```{r eval=FALSE} -con = create_connection_params(apihosturl = "http://localhost:50034/unifi/v1", identityurl = "http://localhost:50333/identity/connect/token") -``` - -If `arcMS` and the `R` session are run from another computer than where the UNIFI API is installed, replace `localhost` by the IP address of the UNIFI API. - -```{r eval=FALSE} -con = create_connection_params(apihosturl = "http://192.0.2.0:50034/unifi/v1", identityurl = "http://192.0.2.0:50333/identity/connect/token") -``` - -```{r include=FALSE} -#con = create_connection_params(apihosturl = "http://10.12.3.154:50034/unifi/v1", identityurl = "http://10.12.3.154:50333/identity/connect/token" ) - -con = create_connection_params(apihosturl = "http://localhost:50034/unifi/v1", identityurl = "http://localhost:50333/identity/connect/token") -``` - -Now these connection parameters will be used to access the UNIFI folders. The following function will show the list of folders and their IDs (e.g. `abe9c297-821e-4152-854a-17c73c9ff68c` in the example below). 
- -```{r eval=FALSE} -folders = folders_search() -folders -``` - -```{r echo=FALSE} -folders = folders_search(con) -folders[3:4,] -``` - -With a folder ID, we can access the list of Analysis items in the folder: - -```{r eval=FALSE} -ana = analysis_search("abe9c297-821e-4152-854a-17c73c9ff68c") -ana -``` - -```{r eval=FALSE, include=FALSE} -ana = analysis_search("abe9c297-821e-4152-854a-17c73c9ff68c") -ana[4:6,] -``` - -Finally, with an Analysis ID, we can get the list of samples (injections) acquired in this Analysis: - -```{r eval=FALSE} -samples = get_samples_list("e236bf99-31cd-44ae-a4e7-74915697df65") -samples -``` - -```{r eval=FALSE, include=FALSE} -samples = get_samples_list("e236bf99-31cd-44ae-a4e7-74915697df65") -samples[2:5,] -``` - -Once we get a sample ID, we can use it to download the sample data, using the `future` framework for parallel processing: - -```{r eval=FALSE} -library(future) -plan(multisession) -convert_one_sample_data(sample_id = "0134efbf-c75a-411b-842a-4f35e2b76347") -``` - -This command will get the sample name (`sample_name`) and its parent analysis (`analysis_name`), create a folder named `analysis_name` in the working directory and save the sample data with the name `sample_name.parquet` and its metadata with the name `sample_name-metadata.json` (metadata is also saved in the parquet file). - -With an Analysis ID, we can convert and save all samples from the chosen Analysis: - -```{r eval=FALSE} -convert_all_samples_data(analysis_id = "e236bf99-31cd-44ae-a4e7-74915697df65") -``` - -To use the HDF5 format instead of Parquet, the format argument can be used as below: - -```{r eval=FALSE} -convert_one_sample_data(sample_id = "0134efbf-c75a-411b-842a-4f35e2b76347", format = "hdf5") - -convert_all_samples_data(analysis_id = "e236bf99-31cd-44ae-a4e7-74915697df65", format = "hdf5") -``` - -This will save the samples data and metadata in the same `file.h5` file. - -Other functions are available to only collect the data from the API to an R object, and then to save this R object to a Parquet file (see `vignette("collect-save-functions"))`. CCS values can also be retrieved in addition to bin index and drift time values, see `vignette("get-ccs-values").` - -Parquet or HDF5 files can be opened easily in `R` with the `arrow` or `rhdf5` packages. Parquet files contain both low and high energy spectra (HDMSe), and HDF5 files contain low energy in the "ms1" dataset, high energy in the "ms2" dataset, and metadata in the "metadata" dataset. The `fromJSON` function from `jsonlite` package will import the metadata json file (associated with the Parquet file) as a list of dataframes. - -```{r eval=FALSE} -sampleparquet = arrow::read_parquet("sample.parquet") -metadataparquet = jsonlite::fromJSON("sample-metadata.json") - -samplems1hdf5 = rhdf5::h5read("sample.h5", name = "ms1") -samplems2hdf5 = rhdf5::h5read("sample.h5", name = "ms2") -samplemetadatahdf5 = rhdf5::h5read("sample.h5", name = "samplemetadata") -spectrummetadatahdf5 = rhdf5::h5read("sample.h5", name = "spectrummetadata") -``` - -## ✨ Shiny App - -A Shiny application is available to use the package easily. 
To run the app, just use the following command (it might need to install a few additional packages): - -```{r eval=FALSE} -run_app() -``` diff --git a/R/convert_bin_to_ccs.R b/R/convert_bin_to_ccs.R index f172637..912a098 100644 --- a/R/convert_bin_to_ccs.R +++ b/R/convert_bin_to_ccs.R @@ -1,4 +1,5 @@ #' Convert bin value to CCS (Collision Cross Section) value +#' #' This function converts bin values to CCS values for the given sample. #' This conversion should be performed for a limited number of detected peaks #' and not on raw data to avoid too many requests and useless CCS calculations @@ -20,11 +21,10 @@ convert_bin_to_ccs <- function(sample_dataset, connection_params = NULL) { # Check if sample_id is accessible - sample_metadata = get_sample_metadata(sample_dataset) - if (!"id" %in% names(sample_metadata)) { - stop("Sample ID is missing.") + sample_id = get_sample_id(sample_dataset) + if (is.null(sample_id)) { + stop("Sample ID is missing in sample_metadata. \nPlease add the sample ID to the sample_dataset object with the add_sample_id() method if needed.") } - sample_id <- sample_metadata$id # Check connection parameters if (is.null(connection_params)) { @@ -39,6 +39,14 @@ convert_bin_to_ccs <- function(sample_dataset, connection_params = NULL) { } sample_data = get_sample_data(sample_dataset) + # check if sample_data is Arrow pointer or data.table in RAM + if (inherits(sample_data, "FileSystemDataset")) { + sample_data = sample_data |> collect() + attr(sample_data, "sample_metadata") = NULL + attr(sample_data, "spectrum_metadata") = NULL + attr(sample_data, "sample_metadata_json") = NULL + attr(sample_data, "spectrum_metadata_json") = NULL + } # Change rt column name if necessary if ("retention_time" %in% names(sample_data)) { @@ -61,21 +69,23 @@ convert_bin_to_ccs <- function(sample_dataset, connection_params = NULL) { charges = rep(1, nrow(sample_data)), retentiontimes = sample_data$rt )) + message(glue::glue("Requesting conversion of bin values to CCS to the UNIFI API...")) # Make API call - response <- httr::POST( - url = sampleUrl, - body = body, - httr::add_headers( - "Content-Type" = "application/json", - "Authorization" = paste("Bearer", token) + response <- httr::POST( + url = sampleUrl, + body = body, + httr::add_headers( + "Content-Type" = "application/json", + "Authorization" = paste("Bearer", token) + ) ) - ) # Check the response status if (httr::status_code(response) != 200) { stop("Error in API request: ", httr::content(response, "text", encoding = "UTF-8")) } + message(glue::glue("Conversion done.")) # Update data with CCS values ccs_values <- jsonlite::fromJSON(httr::content(response, "text", encoding = "UTF-8")) diff --git a/R/generics.R b/R/generics.R index 10f1c33..f3eb42b 100644 --- a/R/generics.R +++ b/R/generics.R @@ -2,8 +2,10 @@ setGeneric("connection_apihosturl", function(obj, ...) standardGeneric("connecti setGeneric("connection_token", function(obj, ...) standardGeneric("connection_token")) setGeneric("get_sample_metadata", function(obj, ...) standardGeneric("get_sample_metadata")) setGeneric("get_sample_name", function(obj, ...) standardGeneric("get_sample_name")) +setGeneric("get_sample_id", function(obj, ...) standardGeneric("get_sample_id")) setGeneric("get_analysis_name", function(obj, ...) standardGeneric("get_analysis_name")) setGeneric("get_sample_metadata_json", function(obj, ...) standardGeneric("get_sample_metadata_json")) setGeneric("get_spectrum_metadata", function(obj, ...) 
standardGeneric("get_spectrum_metadata")) setGeneric("get_spectrum_metadata_json", function(obj, ...) standardGeneric("get_spectrum_metadata_json")) setGeneric("get_sample_data", function(obj, ...) standardGeneric("get_sample_data")) +setGeneric("add_sample_id", function(obj, id, ...) standardGeneric("add_sample_id")) diff --git a/R/main.R b/R/main.R index f6ac94d..eb2af05 100644 --- a/R/main.R +++ b/R/main.R @@ -7,7 +7,7 @@ #' @importFrom utils head tail modifyList setTxtProgressBar txtProgressBar write.csv write.table read.csv data getFromNamespace #' @importFrom stats ave #' @importFrom magrittr %>% -#' @importFrom dplyr mutate select +#' @importFrom dplyr mutate select collect #' @importFrom tidytable uncount unnest #' @importFrom RProtoBuf readProtoFiles #' @importFrom httr GET add_headers content POST @@ -16,7 +16,7 @@ #' @importFrom future plan multisession #' @importFrom future.apply future_lapply #' @importFrom progressr progressor with_progress withProgressShiny -#' @importFrom arrow write_parquet read_parquet +#' @importFrom arrow write_parquet read_parquet open_dataset #' @importFrom rhdf5 h5write h5createFile NULL # need this for doc generation @@ -47,5 +47,6 @@ NULL #' "_PACKAGE" NULL - +setOldClass("FileSystemDataset") setClassUnion("dataframeOrDatatable", c("data.frame", "data.table")) +setClassUnion("dataframeOrDatatableOrArrowdataset", c("data.frame", "data.table", "FileSystemDataset")) diff --git a/R/sample_dataset.R b/R/sample_dataset.R index 4e30685..302494c 100644 --- a/R/sample_dataset.R +++ b/R/sample_dataset.R @@ -4,12 +4,14 @@ NULL #' Class containing a sample data and metadata #' -#' Contains sample data, metadata and spectrum metadata in table and json formats. +#' Contains sample data (as data.table in RAM or as pointer to Parquet file), +#' metadata and spectrum metadata in table and json formats. #' #' Objects for this class are returned by \code{\link{collect_one_sample_data}} #' and \code{\link{create_sample_dataset}}. #' -#' @slot sample_data Contains a \code{datatable} with the sample data. +#' @slot sample_data Contains a \code{datatable} with the sample data, or +#' an \code{Arrow Dataset} R6 object pointing to the Parquet file data. #' @slot sample_metadata Contains a \code{datatable} with the sample metadata. #' @slot spectrum_metadata Contains a \code{datatable} with the spectrum metadata. #' @slot sample_metadata_json Contains a \code{character} with the sample metadata. @@ -22,7 +24,7 @@ NULL #' #' @export sample_dataset <- setClass("sample_dataset", - slots = c(sample_data = "dataframeOrDatatable"), + slots = c(sample_data = "dataframeOrDatatableOrArrowdataset"), contains = "sample_infos") setMethod("initialize", signature = "sample_dataset", definition = function(.Object, ...) @@ -45,7 +47,7 @@ setMethod("get_sample_data", "sample_dataset", function(obj) obj@sample_data) #' The function creates a sample dataset object from data imported from a Parquet file. #'' #' @param file A character file name or URI of Parquet file. -#' @param method Whether to import the data in RAM or keep it on-disk +#' @param method Whether to import the data in RAM ("inram") or keep it on-disk ("ondisk") #' #' @return A \code{\link{sample_dataset}} object, containing the sample data, #' sample metadata and spectrum metadata datatables. 
@@ -53,8 +55,14 @@ setMethod("get_sample_data", "sample_dataset", function(obj) obj@sample_data) #' @export create_sample_dataset <- function(file, method = "inram"){ - - data = read_parquet(file) + if (!method %in% c('inram', 'ondisk')) { + stop("The method must be either 'inram' or 'ondisk'") + } + if (method == "inram") { + data = read_parquet(file) + } else { + data = open_dataset(file) + } sample_metadata = spectrum_metadata = data.table(NULL) sample_metadata_json = spectrum_metadata_json = character(0) @@ -62,18 +70,26 @@ create_sample_dataset <- function(file, method = "inram"){ if (!is.null(attr(data, "sample_metadata"))) { sample_metadata = attributes(data)$sample_metadata attr(data, "sample_metadata") = NULL + } else if (!is.null(data$metadata$sample_metadata)) { + sample_metadata = jsonlite::fromJSON(data$metadata$sample_metadata) } if (!is.null(attr(data, "spectrum_metadata"))) { spectrum_metadata = attributes(data)$spectrum_metadata attr(data, "spectrum_metadata") = NULL + } else if (!is.null(data$metadata$spectrum_metadata)) { + spectrum_metadata = jsonlite::fromJSON(data$metadata$spectrum_metadata) } if (!is.null(attr(data, "sample_metadata_json"))) { sample_metadata_json = as.character(attributes(data)$sample_metadata_json) attr(data, "sample_metadata_json") = NULL + } else if (!is.null(data$metadata$sample_metadata)) { + sample_metadata_json = data$metadata$sample_metadata } if (!is.null(attr(data, "spectrum_metadata_json"))) { spectrum_metadata_json = as.character(attributes(data)$spectrum_metadata_json) attr(data, "spectrum_metadata_json") = NULL + } else if (!is.null(data$metadata$spectrum_metadata)) { + spectrum_metadata_json = data$metadata$spectrum_metadata } collecteddata <- sample_dataset( @@ -87,36 +103,3 @@ create_sample_dataset <- function(file, method = "inram"){ return(collecteddata) } - -#' Class containing a sample data as pointer to Parquet file, and metadata -#' -#' Contains sample data as a pointer to a Parquet file, and sample metadata -#' and spectrum metadata in table and json formats. -#' -#' Objects for this class can be created by \code{\link{create_sample_dataset}} -#' with the ondisk method. -#' -#' @slot sample_data Contains an \code{Arrow Dataset} R6 object pointing to the Parquet file. -#' @slot sample_metadata Contains a \code{datatable} with the sample metadata. -#' @slot spectrum_metadata Contains a \code{datatable} with the spectrum metadata. -#' @slot sample_metadata_json Contains a \code{character} with the sample metadata. -#' @slot spectrum_metadata_json Contains a \code{character} with the spectrum metadata. -#' -#' @section Use the \code{\link{collect_one_sample_data}} to: -#' store the sample data, metadata and spectrum metadata, in table and json formats. -#' -#' @param obj The \code{\link{sample_dataset}} object to access. -#' -#' @export - -sample_dataset_ondisk <- setClass("sample_dataset_ondisk", - slots = c(sample_data = c("FileSystemDataset")), - contains = "sample_infos") -# initialize method during object instantiation -setMethod("initialize", signature = "sample_dataset_ondisk", - definition = function(.Object, sample_data, ...) - { - .Object@sample_data <- sample_data - .Object <- callNextMethod(.Object, ...) 
-    return(.Object)
-  } )
diff --git a/R/sample_infos.R b/R/sample_infos.R
index c8a483f..6b7ec71 100644
--- a/R/sample_infos.R
+++ b/R/sample_infos.R
@@ -44,11 +44,17 @@ setMethod("initialize", signature = "sample_infos",
 setMethod("get_sample_metadata", "sample_infos", function(obj) obj@sample_metadata)
 
 #' @describeIn sample_infos Accessor method to obtain the sample name.
-#' @return \code{get_sample_infos} returns a character object containing the sample name.
+#' @return \code{get_sample_name} returns a character object containing the sample name.
 #' @aliases get_sample_name
 #' @export
 setMethod("get_sample_name", "sample_infos", function(obj) obj@sample_metadata$sampleName)
 
+#' @describeIn sample_infos Accessor method to obtain the sample id.
+#' @return \code{get_sample_id} returns a character object containing the sample id.
+#' @aliases get_sample_id
+#' @export
+setMethod("get_sample_id", "sample_infos", function(obj) obj@sample_metadata$id)
+
 #' @describeIn sample_infos Accessor method to obtain the analysis name.
 #' @return \code{get_analysis_name} returns a character object containing the analysis name.
 #' @aliases get_analysis_name
@@ -85,6 +91,21 @@ setMethod("get_spectrum_metadata_json", "sample_infos", function(obj) {
   }
 })
 
+#' @describeIn sample_infos Method to add a sample ID to the sample_infos object.
+#' @param id The sample ID (a character string) to add to the sample metadata.
+#' @return \code{add_sample_id} returns the \code{sample_infos} object with the sample id added to its metadata.
+#' @aliases add_sample_id
+#' @export
+setMethod("add_sample_id", "sample_infos", function(obj, id) {
+  if (!is.character(id)) {
+    stop("The sample ID must be a character string.")
+  } else {
+    obj@sample_metadata$id <- id
+  }
+  if (validObject(obj))
+    return(obj)
+  })
+
 #' Retrieve Sample Information from UNIFI API with a sample id
 #'
 #' This function retrieves sample metadata and spectrum information from the UNIFI API for a specified sample result using the provided connection parameters.
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 1a9e367..23297ce 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -2,4 +2,13 @@ url: https://leesulab.github.io/arcMS/
 template:
   bootstrap: 5
   bootswatch: cerulean
-
+articles:
+- title: Articles
+  navbar: ~
+  contents:
+  - api-configuration
+  - collect-save-functions
+  - get-ccs-values
+  - open-files
+  - articles/data-filtration-tutorial
+  - distant-query
diff --git a/man/convert_bin_to_ccs.Rd b/man/convert_bin_to_ccs.Rd
index 451b4dd..ec467eb 100644
--- a/man/convert_bin_to_ccs.Rd
+++ b/man/convert_bin_to_ccs.Rd
@@ -2,13 +2,7 @@
 % Please edit documentation in R/convert_bin_to_ccs.R
 \name{convert_bin_to_ccs}
 \alias{convert_bin_to_ccs}
-\title{Convert bin value to CCS (Collision Cross Section) value
-This function converts bin values to CCS values for the given sample.
-This conversion should be performed for a limited number of detected peaks
-and not on raw data to avoid too many requests and useless CCS calculations
-for all m/z peaks and fragments.
-Peak lists can be obtained after peak detection, for example using the DEIMoS Python library.
-For more information on DEIMoS, see \url{https://deimos.readthedocs.io/en/latest/}.}
+\title{Convert bin value to CCS (Collision Cross Section) value}
 \usage{
 convert_bin_to_ccs(sample_dataset, connection_params = NULL)
 }
@@ -25,7 +19,6 @@ The `id` of the sample should be present in the `sample_metadata` slot of the sa
 A sample_dataset object including the original data and an additional column `CCS` containing the CCS values.
 }
 \description{
-Convert bin value to CCS (Collision Cross Section) value
 This function converts bin values to CCS values for the given sample.
 This conversion should be performed for a limited number of detected peaks
 and not on raw data to avoid too many requests and useless CCS calculations
diff --git a/man/create_sample_dataset.Rd b/man/create_sample_dataset.Rd
index 8483f3f..9f028c7 100644
--- a/man/create_sample_dataset.Rd
+++ b/man/create_sample_dataset.Rd
@@ -9,7 +9,7 @@ create_sample_dataset(file, method = "inram")
 \arguments{
 \item{file}{A character file name or URI of Parquet file.}
 
-\item{method}{Whether to import the data in RAM or keep it on-disk}
+\item{method}{Whether to import the data in RAM ("inram") or keep it on-disk ("ondisk")}
 }
 \value{
 A \code{\link{sample_dataset}} object, containing the sample data,
diff --git a/man/sample_dataset-class.Rd b/man/sample_dataset-class.Rd
index 95abe1f..24520d6 100644
--- a/man/sample_dataset-class.Rd
+++ b/man/sample_dataset-class.Rd
@@ -17,7 +17,8 @@
 \code{get_sample_data} returns a data.table object containing the sample data.
 }
 \description{
-Contains sample data, metadata and spectrum metadata in table and json formats.
+Contains sample data (as data.table in RAM or as pointer to Parquet file),
+metadata and spectrum metadata in table and json formats.
 }
 \details{
 Objects for this class are returned by \code{\link{collect_one_sample_data}}
 and \code{\link{create_sample_dataset}}.
@@ -31,7 +32,8 @@
 \section{Slots}{
 
 \describe{
-\item{\code{sample_data}}{Contains a \code{datatable} with the sample data.}
+\item{\code{sample_data}}{Contains a \code{datatable} with the sample data, or
+an \code{Arrow Dataset} R6 object pointing to the Parquet file data.}
 
 \item{\code{sample_metadata}}{Contains a \code{datatable} with the sample metadata.}
 
diff --git a/man/sample_dataset_ondisk-class.Rd b/man/sample_dataset_ondisk-class.Rd
deleted file mode 100644
index 137f817..0000000
--- a/man/sample_dataset_ondisk-class.Rd
+++ /dev/null
@@ -1,37 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sample_dataset.R
-\docType{class}
-\name{sample_dataset_ondisk-class}
-\alias{sample_dataset_ondisk-class}
-\alias{sample_dataset_ondisk}
-\title{Class containing a sample data as pointer to Parquet file, and metadata}
-\arguments{
-\item{obj}{The \code{\link{sample_dataset}} object to access.}
-}
-\description{
-Contains sample data as a pointer to a Parquet file, and sample metadata
-and spectrum metadata in table and json formats.
-}
-\details{
-Objects for this class can be created by \code{\link{create_sample_dataset}}
-with the ondisk method.
-}
-\section{Slots}{
-
-\describe{
-\item{\code{sample_data}}{Contains an \code{Arrow Dataset} R6 object pointing to the Parquet file.}
-
-\item{\code{sample_metadata}}{Contains a \code{datatable} with the sample metadata.}
-
-\item{\code{spectrum_metadata}}{Contains a \code{datatable} with the spectrum metadata.}
-
-\item{\code{sample_metadata_json}}{Contains a \code{character} with the sample metadata.}
-
-\item{\code{spectrum_metadata_json}}{Contains a \code{character} with the spectrum metadata.}
-}}
-
-\section{Use the \code{\link{collect_one_sample_data}} to}{
-
- store the sample data, metadata and spectrum metadata, in table and json formats.
-}
-
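For context, a minimal sketch of how the new accessors introduced in this changeset compose with on-disk datasets (the file name, sample ID and `con` connection parameters below are placeholders, and per the function documentation the CCS conversion should be run on a reduced peak list rather than on full raw data):

```r
library(arcMS)
# Open a converted sample without loading it in RAM (Arrow pointer)
dataset = create_sample_dataset("sample.parquet", method = "ondisk")
# Store the UNIFI sample ID if it is missing from the metadata
dataset = add_sample_id(dataset, "0134efbf-c75a-411b-842a-4f35e2b76347")
get_sample_id(dataset)
#> [1] "0134efbf-c75a-411b-842a-4f35e2b76347"
# convert_bin_to_ccs() can now find the sample ID in the metadata
dataset = convert_bin_to_ccs(dataset, connection_params = con)
```
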
diff --git a/man/sample_infos-class.Rd b/man/sample_infos-class.Rd
index 3202596..a68404d 100644
--- a/man/sample_infos-class.Rd
+++ b/man/sample_infos-class.Rd
@@ -8,6 +8,8 @@
 \alias{get_sample_metadata}
 \alias{get_sample_name,sample_infos-method}
 \alias{get_sample_name}
+\alias{get_sample_id,sample_infos-method}
+\alias{get_sample_id}
 \alias{get_analysis_name,sample_infos-method}
 \alias{get_analysis_name}
 \alias{get_sample_metadata_json,sample_infos-method}
@@ -16,12 +18,16 @@
 \alias{get_spectrum_metadata}
 \alias{get_spectrum_metadata_json,sample_infos-method}
 \alias{get_spectrum_metadata_json}
+\alias{add_sample_id,sample_infos-method}
+\alias{add_sample_id}
 \title{Class containing a sample's information}
 \usage{
 \S4method{get_sample_metadata}{sample_infos}(obj)
 
 \S4method{get_sample_name}{sample_infos}(obj)
 
+\S4method{get_sample_id}{sample_infos}(obj)
+
 \S4method{get_analysis_name}{sample_infos}(obj)
 
 \S4method{get_sample_metadata_json}{sample_infos}(obj)
@@ -29,14 +35,20 @@
 \S4method{get_spectrum_metadata}{sample_infos}(obj)
 
 \S4method{get_spectrum_metadata_json}{sample_infos}(obj)
+
+\S4method{add_sample_id}{sample_infos}(obj, id)
 }
 \arguments{
 \item{obj}{The \code{\link{sample_infos}} object to access.}
+
+\item{id}{The sample ID (a character string) to add to the sample metadata.}
 }
 \value{
 \code{get_sample_metadata} returns a data.table object containing the sample metadata.
 
-\code{get_sample_infos} returns a character object containing the sample name.
+\code{get_sample_name} returns a character object containing the sample name.
+
+\code{get_sample_id} returns a character object containing the sample id.
 
 \code{get_analysis_name} returns a character object containing the analysis name.
 
@@ -45,6 +57,8 @@
 \code{get_spectrum_metadata} returns a data.table object containing the spectrum metadata of a sample.
 
 \code{get_spectrum_metadata_json} returns a json character object containing the spectrum metadata of a sample.
+
+\code{add_sample_id} returns the \code{sample_infos} object with the sample id added to its metadata.
 }
 \description{
 Contains sample metadata and spectrum metadata in table and json formats.
@@ -58,6 +72,8 @@ Objects for this class are returned by \code{\link{get_sample_infos}}.
 
 \item \code{get_sample_name(sample_infos)}: Accessor method to obtain the sample name.
 
+\item \code{get_sample_id(sample_infos)}: Accessor method to obtain the sample id.
+
 \item \code{get_analysis_name(sample_infos)}: Accessor method to obtain the analysis name.
 
 \item \code{get_sample_metadata_json(sample_infos)}: Accessor method to obtain the sample metadata in json format.
 
 \item \code{get_spectrum_metadata(sample_infos)}: Accessor method to obtain the spectrum metadata.
 
 \item \code{get_spectrum_metadata_json(sample_infos)}: Accessor method to obtain the spectrum metadata in json format.
 
+\item \code{add_sample_id(sample_infos)}: Method to add a sample ID to the sample_infos object.
+
 }}
 \section{Slots}{
diff --git a/vignettes/articles/bpi.rds b/vignettes/articles/bpi.rds
new file mode 100644
index 0000000..6323a4c
Binary files /dev/null and b/vignettes/articles/bpi.rds differ
diff --git a/vignettes/articles/data-filtration-tutorial.Rmd b/vignettes/articles/data-filtration-tutorial.Rmd
new file mode 100644
index 0000000..7c1bbe3
--- /dev/null
+++ b/vignettes/articles/data-filtration-tutorial.Rmd
@@ -0,0 +1,458 @@
---
title: "Queries and filtration of Parquet files - full tutorial"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Queries and filtration of Parquet files - full tutorial}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r, include = FALSE}
#library(arrow)
#download.file("https://media.githubusercontent.com/media/leesulab/arcms-dataviz/main/cal1-full-long.parquet", "cal1.parquet")
# data = read_parquet("cal1.parquet")
data = readRDS("hdata.rds")
TIC = readRDS("tic.rds")
BPI = readRDS("bpi.rds")
EIC = readRDS("eic.rds")
library("fontawesome")
```

HRMS data converted with `arcMS` is stored as columnar files in the Apache Parquet format, which offers high compression, fast access through the `arrow` library in R, and compatibility with recent peak detection libraries such as [DEIMoS](https://deimos.readthedocs.io/en/latest/) (which supports IMS data).

The Apache Parquet format can easily be read with many programming languages such as Python `r fa("python", fill = "steelblue")` or R `r fa("r-project", fill = "steelblue")`.

Very small files can be obtained with this compressed format, even when the data contains additional dimensions such as ion mobility, and even in profile mode. A full data file in profile mode containing IMS and HDMSE data (MS and MS/MS) is around 200-400 MB.

But the Parquet format and the Apache Arrow library offer another powerful feature: the data does not even need to be loaded in memory to be accessed, filtered and aggregated, allowing complex manipulation of big datasets with a very limited memory footprint.

## `r fa("memory")` Loading data in memory

With the `arrow` package, it is easy to load a file (here is an example file you can download: [cal1_sample.parquet](https://github.com/leesulab/arcms-dataviz/raw/refs/heads/main/cal1-full-long.parquet)) in the R environment as a dataframe/data.table:

```{r warning=FALSE, message=FALSE}
library(arrow)
library(data.table)
library(dplyr)
library(tidyr) # for pivot_wider()
library(duckdb)
library(plotly)
```

```{r eval=FALSE}
data = read_parquet("converted_file.parquet")
```

```{r}
head(data)
```

The columns in this file are:

| Column | Description |
|-----|-----|
| rt | retention time |
| scanid | an identifier for each scan/retention time |
| mslevel | low (1) or high (2) collision energy in HDMSe mode (equivalent to MS1/MS2 data) |
| mz | *m/z* ratio, in profile mode |
| intensity | intensity of each detected *m/z* |
| bin | drift time *bin* |
| dt | drift time |

Loading the data is really fast, and the dataframe can then be quickly sorted/arranged/aggregated as desired with modern libraries such as `data.table`.
For example, we can quickly get the Total Ion Chromatogram (after filtering the data to keep only low collision energy data, i.e. keeping MS1 and discarding MS2):

```{r, eval=FALSE}
datalow = data[mslevel == 1, ]
TIC = datalow[, list(intensity = sum(intensity)), by=list(rt)]
```

```{r fig.width = 6}
plot_ly(TIC,
    y=~intensity,
    x=~rt,
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'rgba(0,0,0,1)', width = 1, shape = 'spline', smoothing = 1)
)
```

But still, the loaded data for this example sample occupies ~1.9 GB in memory, so a whole sequence of usually 10-30 samples (and even more when using triplicate injections) can quickly fill the available RAM of computers.

## `r fa("database")` Querying data with Arrow

The Arrow library can read Parquet files without loading the whole file in memory... and this is where the magic happens!

As described in `vignette("open-files")`, a Parquet file can also be read on-disk in R with the `open_dataset()` function:

```{r, eval = FALSE}
data_arrow = open_dataset("converted_file.parquet")
```

We just created a pointer to the Parquet file, but nothing was loaded in memory yet. Data can then be filtered, rearranged, sorted and aggregated with the `dplyr` syntax, and only the resulting data will be loaded in RAM (with the `collect()` function):

```{r, eval = FALSE, message=FALSE}
fulldatalow = data_arrow |>
  filter(mslevel == 1) |>
  arrange(scanid) |>
  collect()
```

The block above would collect all the low energy data arranged by increasing scanid (an identifier for each retention time scan).

To get an overview of the columns available in the file, their type and their content, we cannot use `str(data_arrow)` or `summary(data_arrow)`, but we can use `glimpse(data_arrow)` from the `dplyr` package:

```{r, eval=FALSE}
glimpse(data_arrow)
#> FileSystemDataset with 1 Parquet file
#> 57,980,927 rows x 7 columns
#> $ rt        0.007404283, 0.007404283, 0.00740428…
#> $ scanid    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ mslevel   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ mz        556.2536, 556.2566, 556.2596, 556.26…
#> $ intensity 0, 5, 7, 0, 0, 1, 2, 0, 0, 3, 2, 0, …
#> $ bin       19, 19, 19, 19, 23, 23, 23, 23, 23, …
#> $ dt        1.349, 1.349, 1.349, 1.349, 1.633, 1…
#> Call `print()` for full schema details
```

We can also use the `summarise()` function after grouping by the column of interest, for example to quickly obtain the distinct values of the mslevel column:

```{r, eval = FALSE}
data_arrow |>
  group_by(mslevel) |>
  summarise() |>
  collect()
#> # A tibble: 2 × 1
#>   mslevel
#> 1       1
#> 2       2
```

## `r fa("chart-line")` Chromatograms

### TIC

Obtaining the same TIC as before is now just a matter of aggregating intensity values by retention time:

```{r, eval = FALSE, message=FALSE}
TIC = data_arrow |>
  filter(mslevel == "1") |>
  group_by(rt) |>
  summarise(intensity = sum(intensity)) |>
  collect()
```

```{r}
format(object.size(TIC), units = 'auto')
```

This took around 1 s and the resulting data is only 64 KB!
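
The collected `TIC` tibble is now a regular in-memory object, so if the chromatogram is to be reused it can for example be written to its own small Parquet file and reloaded instantly (the file name below is arbitrary):

```{r, eval = FALSE}
# Persist the aggregated chromatogram for later sessions
write_parquet(TIC, "cal1-tic.parquet")
# ...and reload it later without recomputing the aggregation
TIC = read_parquet("cal1-tic.parquet")
```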
### BPI

If we now want the Base Peak Intensity chromatogram (BPI) instead of the TIC, we simply use the `max()` function to take the peak of maximum intensity at each retention time:

```{r, eval = FALSE}
BPI = data_arrow |>
  filter(mslevel == "1") |>
  arrange(rt) |>
  group_by(rt) |>
  summarise(intensity = max(intensity)) |>
  collect()
```

```{r fig.width = 6}
plot_ly(BPI,
    y=~intensity,
    x=~rt,
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'rgba(0,0,0,1)', width = 1, shape = 'spline', smoothing = 1)
)
```

### Extracted Ion Chromatogram (EIC)

Now if we want to keep the data around a selected *m/z* ratio to obtain an EIC (say around the *m/z* ratio of atrazine: 216.1016), we can again use the `filter()` function (and choose a tolerance for the *m/z* window):

```{r, eval = FALSE}
EIC = data_arrow |>
  filter(mslevel == "1") |>
  arrange(rt) |>
  filter(mz > 216.0016 & mz < 216.2016) |>
  group_by(rt, scanid) |>
  summarise(intensity = sum(intensity)) |>
  collect()

EIC = as.data.table(EIC)
```

```{r fig.width = 6}
plot_ly(EIC,
    y=~intensity,
    x=~rt,
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'rgba(0,0,0,1)', width = 1, shape = 'spline', smoothing = 1)
)
```

## `r fa("chart-area")` Full 2D plots

Now, how can we handle more data? For example, what if we want to draw a full 2D plot displaying *m/z* vs rt, rt vs dt/bin, or *m/z* vs dt/bin?

A first approach would be to simply aggregate data by two groups of choice, e.g. rt and mz, to get rid of the third parameter (here the dt bin):

```{r, eval = FALSE}
rtmz = data_arrow |>
  filter(mslevel == "1") |>
  arrange(rt) |>
  group_by(rt, mz) |>
  summarise(intensity = sum(intensity)) |>
  collect()

rtmz = as.data.table(rtmz)
rtmz = rtmz[intensity != 0,]
rtmz = rtmz[order(rtmz$mz),]
rtmz = rtmz[order(rtmz$rt),]
```

This query is a bit longer and the resulting object is ~300 MB, even after removing zero intensity values; with ~1 million rows of rt and mz values, it is way too large to build a matrix for plotting a heatmap or contour plot with `plotly`.

So let's apply some data reduction by binning close rt and mz values to obtain a smaller matrix, directly in the chain of arrow commands with the `mutate()` function:

```{r eval = FALSE, message=FALSE}
rtmzbinned = data_arrow |>
  filter(mslevel == "1") |>
  arrange(rt) |>
  group_by(rt, mz) |>
  summarise(sum_int = sum(intensity)) |>
  mutate(rt_binned = floor(rt/0.1)*0.1) |>
  mutate(mz_binned = floor(mz/1)*1) |>
  group_by(rt_binned, mz_binned) |>
  summarise(sum_int_binned = sum(sum_int)) |>
  collect()
```

The resulting object is now just a few MB, but the query takes ~30 s to 1 min depending on the bin widths chosen for the mz and rt values.
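
To check these timings on your own file, note that the Arrow query is lazy: nothing is computed until `collect()`, so the whole pipeline can be wrapped in `system.time()`, for example:

```{r eval = FALSE}
# Measure the Arrow-only binned query; timings depend on hardware and bin widths
system.time({
  rtmzbinned = data_arrow |>
    filter(mslevel == "1") |>
    group_by(rt, mz) |>
    summarise(sum_int = sum(intensity)) |>
    mutate(rt_binned = floor(rt/0.1)*0.1) |>
    mutate(mz_binned = floor(mz/1)*1) |>
    group_by(rt_binned, mz_binned) |>
    summarise(sum_int_binned = sum(sum_int)) |>
    collect()
})
```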
Now this is where DuckDB enters! It is a database management system that can query Arrow data directly through an SQL interface, without any copy of the data. It allows queries that are not yet available in Arrow's dplyr backend, and can process complex data types very efficiently.

To use DuckDB, only one more line is needed in our dplyr pipeline: we just need to transfer the data (with zero-copy streaming) to the DuckDB system with the `to_duckdb()` function:

```{r eval = FALSE, message=FALSE, warning=FALSE}
library(duckdb)

rtmzbinned = data_arrow |>
  to_duckdb() |>
  filter(mslevel == "1") |>
  mutate(rt_binned = floor(rt/0.1)*0.1) |>
  mutate(mz_binned = floor(mz/1)*1) |>
  group_by(rt_binned, mz_binned) |>
  summarise(intensity = sum(intensity)) |>
  collect()

rtmzbinned = as.data.table(rtmzbinned)
rtmzbinned = setorderv(rtmzbinned, c("rt_binned", "mz_binned"))
```

This now runs in a few seconds. We can then prepare our matrix for a 2D contour plot:

```{r, eval = FALSE}
spreaddt = rtmzbinned |> pivot_wider(names_from = rt_binned, values_from = intensity)
spreaddt = spreaddt[order(spreaddt$mz_binned),]
setnafill(spreaddt, fill = 0)
spreadmatrix = as.matrix(spreaddt[,-1])
```
```{r, include = FALSE}
spreadmatrix = readRDS("mat.rds")
spreaddt = readRDS("spd.rds")
```

```{r fig.width = 6}
plot_ly(
  x = as.numeric(colnames(spreaddt[,-1])),
  y = spreaddt$mz_binned,
  z = spreadmatrix,
  type = "heatmap",
  zmin = 0,
  zmax = 100000
)
```

The same data could be used for a 3D plot as well. Zooming and hovering on this plot does not give accurate masses: it is just a raw preview of the data because of the binning with the `floor()` function. But we could make a new query to obtain the accurate data at specific rt and mz ranges, without any binning, e.g. around atrazine:

```{r eval = FALSE, echo=FALSE, message=FALSE, warning=FALSE}
rtmzatra = data_arrow |>
  to_duckdb() |>
  filter(mslevel == "1") |>
  filter(rt > 9.54 & rt < 9.8) |>
  filter(mz > 215 & mz < 220) |>
  group_by(rt, mz) |>
  summarise(intensity = sum(intensity)) |>
  collect()

rtmzatra = as.data.table(rtmzatra)
rtmzatra = setorderv(rtmzatra, c("rt", "mz"))
spreaddt = rtmzatra |> pivot_wider(names_from = rt, values_from = intensity)
spreaddt = spreaddt[order(spreaddt$mz),]
setnafill(spreaddt, fill = 0)
spreadmatrix = as.matrix(spreaddt[,-1])
```

```{r, include=FALSE}
spreaddt = readRDS("spdatra.rds")
spreadmatrix = readRDS("matatra.rds")
```

```{r fig.width = 6}
plot_ly(
  x = as.numeric(colnames(spreaddt[,-1])),
  y = spreaddt$mz,
  z = spreadmatrix,
  type = "contour",
  zmin = 0,
  zmax = 100000
)
```

## `r fa("chart-column")` MS spectra

Plotting MS spectra is straightforward at this point: here is the low energy spectrum of atrazine in profile mode (knowing the scanid values around its retention time):

```{r eval = FALSE, message=FALSE}
scanids = c(1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160)

MS = data_arrow |>
  filter(mslevel == "1") |>
  filter(scanid %in% !!scanids) |>
  group_by(mz) |>
  summarise(intensity = sum(intensity)) |>
  arrange(mz) |>
  collect()
```

```{r include=FALSE}
MS = readRDS("msatra.rds")
```

```{r fig.width = 6}
plot_ly(data = MS,
    x = ~mz,
    y = ~intensity,
    type="scatter",
    mode = "lines") %>%
  layout(
    xaxis = list(title = "m/z"),
    yaxis = list(title = "Intensity"))
```

Now if we want a clean spectrum filtered thanks to ion mobility, we need to know the drift time of the molecule of interest, and this is not always available. When the drift time is not known, we can filter based on the *bin* parameter corresponding to the mobility separation (1 to 200 bins for each *rt*).
We can thus plot the IMS 2D trace for a given *rt*, or a list of *rt*s (here selected by their *scanid*):

```{r eval=FALSE, message=FALSE}
scanids = c(1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160)

IMStrace = data_arrow |>
  to_duckdb() |>
  filter(mslevel == "1") |>
  filter(scanid %in% !!scanids) |>
  group_by(bin, mz) |>
  summarise(sum_int = sum(intensity)) |>
  mutate(mz_binned = floor(mz/1)*1) |>
  group_by(bin, mz_binned) |>
  summarise(sum_int_binned = sum(sum_int)) |>
  collect()

IMStrace = IMStrace[order(IMStrace$bin),]
spreadIMS = IMStrace |> pivot_wider(names_from = bin, values_from = sum_int_binned)
spreadIMS = spreadIMS[order(spreadIMS$mz_binned),]
setnafill(spreadIMS, fill = 0)
spreadmatrixIMS = as.matrix(spreadIMS[,-1])
```

```{r include=FALSE}
spreadIMS = readRDS("spdims.rds")
spreadmatrixIMS = readRDS("matims.rds")
```

```{r fig.width = 6}
plot_ly(
  x = as.numeric(colnames(spreadIMS[,-1])),
  y = spreadIMS$mz_binned,
  z = spreadmatrixIMS,
  type = "heatmap",
  zmin = 0,
  zmax = 100000
) %>%
  layout(
    xaxis = list(title = "bin"),
    yaxis = list(title = "m/z"))
```

The *bin* value of atrazine is around 62. In this plot, we also see a molecule at *bin* ~50 and *m/z* ~174, probably an in-source fragment of atrazine, hence its different drift time.

### MS spectrum filtering

Now that we found that the *bin* values of atrazine range from ~55 to ~70, we can use the `filter()` function to select only these values of interest:

```{r eval=FALSE, message=FALSE}
scanids = c(1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160)
binsarray = c(58, 59, 60, 61, 62, 63, 64, 65, 66)

MSf = data_arrow |>
  filter(mslevel == "1") |>
  filter(scanid %in% !!scanids) |>
  filter(bin %in% binsarray) |>
  group_by(mz) |>
  summarise(intensity = sum(intensity)) |>
  arrange(mz) |>
  collect()
```

```{r, include=FALSE}
MSf = readRDS("msatraf.rds")
```

```{r fig.width = 6}
plot_ly(data = MSf,
    x = ~mz,
    y = ~intensity,
    type="scatter",
    mode = "lines") %>%
  layout(
    xaxis = list(title = "m/z"),
    yaxis = list(title = "Intensity"))
```

Here we quickly combined several scans (retention times) and several bins corresponding to atrazine, which almost completely filtered out the other molecule.

## `r fa("list")` Conclusion

- The Apache Parquet file format, in addition to its high compression and light data files, can be read directly by the Arrow library without loading the data in memory.

- The interoperability of Arrow, DuckDB and dplyr makes data queries extremely fast and easy to write in a few lines.

- Parquet, Arrow and DuckDB are cross-platform and multi-language: they can be used with R, Python or Julia.

- Thanks to all these possibilities, we developed a simple [R Shiny application](https://github.com/leesulab/arcms-dataviz) to visualize HRMS data directly in the browser, without requiring workstations with lots of RAM.
diff --git a/vignettes/articles/eic.rds b/vignettes/articles/eic.rds
new file mode 100644
index 0000000..a8de40a
Binary files /dev/null and b/vignettes/articles/eic.rds differ
diff --git a/vignettes/articles/hdata.rds b/vignettes/articles/hdata.rds
new file mode 100644
index 0000000..96940e0
Binary files /dev/null and b/vignettes/articles/hdata.rds differ
diff --git a/vignettes/articles/mat.rds b/vignettes/articles/mat.rds
new file mode 100644
index 0000000..5b0d402
Binary files /dev/null and b/vignettes/articles/mat.rds differ
diff --git a/vignettes/articles/matatra.rds b/vignettes/articles/matatra.rds
new file mode 100644
index 0000000..d45b27a
Binary files /dev/null and b/vignettes/articles/matatra.rds differ
diff --git a/vignettes/articles/matims.rds b/vignettes/articles/matims.rds
new file mode 100644
index 0000000..7de50d1
Binary files /dev/null and b/vignettes/articles/matims.rds differ
diff --git a/vignettes/articles/msatra.rds b/vignettes/articles/msatra.rds
new file mode 100644
index 0000000..28ef1e8
Binary files /dev/null and b/vignettes/articles/msatra.rds differ
diff --git a/vignettes/articles/msatraf.rds b/vignettes/articles/msatraf.rds
new file mode 100644
index 0000000..6b46199
Binary files /dev/null and b/vignettes/articles/msatraf.rds differ
diff --git a/vignettes/articles/sample_metadata.rds b/vignettes/articles/sample_metadata.rds
new file mode 100644
index 0000000..ccff360
Binary files /dev/null and b/vignettes/articles/sample_metadata.rds differ
diff --git a/vignettes/articles/spd.rds b/vignettes/articles/spd.rds
new file mode 100644
index 0000000..c3f4514
Binary files /dev/null and b/vignettes/articles/spd.rds differ
diff --git a/vignettes/articles/spdatra.rds b/vignettes/articles/spdatra.rds
new file mode 100644
index 0000000..b35848b
Binary files /dev/null and b/vignettes/articles/spdatra.rds differ
diff --git a/vignettes/articles/spdims.rds b/vignettes/articles/spdims.rds
new file mode 100644
index 0000000..4be3dee
Binary files /dev/null and b/vignettes/articles/spdims.rds differ
diff --git a/vignettes/articles/spectrum_metadata.rds b/vignettes/articles/spectrum_metadata.rds
new file mode 100644
index 0000000..5692468
Binary files /dev/null and b/vignettes/articles/spectrum_metadata.rds differ
diff --git a/vignettes/articles/strsamplemeta.rds b/vignettes/articles/strsamplemeta.rds
new file mode 100644
index 0000000..1d0e39b
Binary files /dev/null and b/vignettes/articles/strsamplemeta.rds differ
diff --git a/vignettes/articles/tic.rds b/vignettes/articles/tic.rds
new file mode 100644
index 0000000..539bda9
Binary files /dev/null and b/vignettes/articles/tic.rds differ
diff --git a/vignettes/collect-save-functions.Rmd b/vignettes/collect-save-functions.Rmd
index 91d447c..460b34c 100644
--- a/vignettes/collect-save-functions.Rmd
+++ b/vignettes/collect-save-functions.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "collect-save-functions"
+title: "Collect and Save functions"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{collect-save-functions}
+  %\VignetteIndexEntry{Collect and Save functions}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
diff --git a/vignettes/distant-query.Rmd b/vignettes/distant-query.Rmd
new file mode 100644
index 0000000..4c01356
--- /dev/null
+++ b/vignettes/distant-query.Rmd
@@ -0,0 +1,68 @@
---
title: "Distant query to Parquet file"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Distant query to Parquet file}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

A Parquet file can be hosted online and queried with almost the same methods as shown in `vignette("data-filtration-tutorial")`.

We can use DuckDB again, but we first need to create a remote database connection before making our query with the dplyr syntax.

```{r eval=FALSE}
library(DBI)
library(dplyr)
library(duckdb)
```

```{r eval=FALSE}
con = dbConnect(duckdb::duckdb(), dbdir=":memory:", read_only=FALSE)

dbExecute(con, "INSTALL httpfs;") # run once to install the httpfs extension
dbExecute(con, "LOAD httpfs;")

dbExecute(con,
  "CREATE VIEW cal1 AS
  SELECT * FROM PARQUET_SCAN('https://github.com/leesulab/arcms-dataviz/raw/refs/heads/main/cal1-full-long.parquet');
")
```

Check that the view has been created:

```{r eval=FALSE}
dbListTables(con)
```

Then make the query:

```{r eval=FALSE}
tic = tbl(con, "cal1") |>
  filter(mslevel == 1) |>
  group_by(rt) |>
  summarise(intensity = sum(intensity)) |>
  arrange(rt) |>
  collect()
```

The aggregated TIC data has been downloaded and can be plotted as shown in the other vignettes:

```{r eval=FALSE}
plotly::plot_ly(tic,
    y=~intensity,
    x=~rt,
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'rgba(0,0,0,1)', width = 1, shape = 'spline', smoothing = 1)
)
```

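Once the queries are done, the connection can be closed, which also shuts down the embedded DuckDB instance, for example:

```{r eval=FALSE}
# Close the connection and shut down the in-memory DuckDB instance
dbDisconnect(con, shutdown = TRUE)
```
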
diff --git a/vignettes/hdata.rds b/vignettes/hdata.rds
new file mode 100644
index 0000000..96940e0
Binary files /dev/null and b/vignettes/hdata.rds differ
diff --git a/vignettes/open-files.Rmd b/vignettes/open-files.Rmd
index de636fe..691d8bb 100644
--- a/vignettes/open-files.Rmd
+++ b/vignettes/open-files.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "open-files"
+title: "Open Parquet files"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{open-files}
+  %\VignetteIndexEntry{Open Parquet files}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
@@ -29,9 +29,10 @@ Files created by `arcMS` in the Parquet format can be opened in R with the Arrow
 
 Data can be loaded in RAM with the following commands:
 
 ```{r, include = FALSE}
-library(arrow)
-download.file("https://media.githubusercontent.com/media/leesulab/arcms-dataviz/main/cal1-full-long.parquet", "cal1.parquet")
-data = read_parquet("cal1.parquet")
+#library(arrow)
+#download.file("https://media.githubusercontent.com/media/leesulab/arcms-dataviz/main/cal1-full-long.parquet", "cal1.parquet")
+# data = read_parquet("cal1.parquet")
+data = readRDS("hdata.rds")
 ```
 
 ```{r, eval = FALSE}
@@ -45,7 +46,7 @@
 
 head(data)
 ```
 
 Data can then be filtered and quickly aggregated, e.g. to obtain TIC plot:
 
-```{r}
+```{r, eval = FALSE}
 library(data.table)
 data = as.data.table(data)
 datalow = data[mslevel == 1, ]
@@ -53,19 +54,20 @@
 TIC = datalow[, list(intensity = sum(intensity)), by=list(rt)]
 plot(TIC$rt, TIC$intensity, type = "l")
 ```
 
-To save RAM, data can also be manipulated directly on-disk thanks to the Arrow library (`open_dataset()` function):
-
-```{r, include = FALSE}
-data = open_dataset("cal1.parquet")
+```{r echo=FALSE}
+TIC = readRDS("tic.rds")
+plot(TIC$rt, TIC$intensity, type = "l")
 ```
 
+To save RAM, data can also be manipulated directly on-disk thanks to the Arrow library (`open_dataset()` function):
+
 ```{r, eval = FALSE}
 data = open_dataset("converted_file.parquet")
 ```
 
 Data can be filtered, rearranged, sorted and aggregated with the dplyr syntax, and only the resulting data will be loaded in RAM (with the `collect()` function):
 
-```{r message=FALSE}
+```{r, eval = FALSE, message=FALSE}
 library(dplyr)
 
 TIC = data |>
@@ -80,12 +82,12 @@
 
 Parquet files can be opened as a DataFrame in Python with the pandas library:
 
-```{python, include = FALSE, eval = FALSE}
+```{python, include = FALSE, eval = FALSE, python.reticulate = FALSE}
 import pandas as pd
 data = pd.read_parquet("cal1.parquet")
 ```
 
-```{python, eval = FALSE}
+```{python, eval = FALSE, python.reticulate = FALSE}
 import pandas as pd
 data = pd.read_parquet("converted_file.parquet")
 ms1 = data[data['mslevel'] == "1"]
 print(ms1)
 ```
 
 It can also be loaded as an Arrow object (ParquetDataset) with the pyarrow library:
 
-```{python, include = FALSE, eval = FALSE}
+```{python, include = FALSE, eval = FALSE, python.reticulate = FALSE}
 import pyarrow.parquet as pq
 data = pq.ParquetDataset("cal1.parquet")
 ```
 
-```{python, eval = FALSE}
+```{python, eval = FALSE, python.reticulate = FALSE}
 import pyarrow.parquet as pq
 data = pq.ParquetDataset("converted_file.parquet")
 ```
 
@@ -110,31 +112,44 @@ The methods above only retrieve the main data from the Parquet file, but not met
 
 To simplify opening both data and metadata, functions are available in `arcMS` to load a Parquet file as a `sample_dataset` object, also allowing easy manipulation with some generic functions.
 
-```{r}
-library(arcMS)
-dataset = create_sample_dataset("cal1.parquet")
-```
-
 ```{r, eval= FALSE}
 library(arcMS)
 dataset = create_sample_dataset("converted_file.parquet")
 ```
 
+Retrieve main data:
+
+```{r, eval = FALSE}
 data = get_sample_data(dataset)
-head(data)
 ```
 
 ```{r}
+head(data)
+```
+
+Retrieve sample metadata:
+
+```{r, eval = FALSE}
 sample_metadata = get_sample_metadata(dataset)
-str(sample_metadata)
 ```
+
+```{r, include = FALSE}
+sample_metadata = readRDS("sample_metadata.rds")
+```
 ```{r}
+str(sample_metadata)
+```
+
+Retrieve spectrum metadata:
+
+```{r, eval = FALSE}
 spectrum_metadata = get_spectrum_metadata(dataset)
-str(spectrum_metadata)
 ```
+
+```{r, include = FALSE}
+spectrum_metadata = readRDS("spectrum_metadata.rds")
+```
 ```{r}
-
+str(spectrum_metadata)
 ```
+
diff --git a/vignettes/sample_metadata.rds b/vignettes/sample_metadata.rds
new file mode 100644
index 0000000..ccff360
Binary files /dev/null and b/vignettes/sample_metadata.rds differ
diff --git a/vignettes/spectrum_metadata.rds b/vignettes/spectrum_metadata.rds
new file mode 100644
index 0000000..5692468
Binary files /dev/null and b/vignettes/spectrum_metadata.rds differ
diff --git a/vignettes/tic.rds b/vignettes/tic.rds
new file mode 100644
index 0000000..539bda9
Binary files /dev/null and b/vignettes/tic.rds differ