diff --git a/DESCRIPTION b/DESCRIPTION index c951252..09b9569 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,7 +54,7 @@ Suggests: omock (>= 0.3.0), covr, ggplot2, - visOmopResults (>= 0.4.0) + visOmopResults (>= 0.5.0) Config/testthat/edition: 3 Config/testthat/parallel: true Imports: @@ -65,7 +65,7 @@ Imports: dplyr, glue, lifecycle, - omopgenerics (>= 0.3.1), + omopgenerics (>= 0.4.1), PatientProfiles (>= 1.2.1), purrr, rlang, @@ -77,5 +77,4 @@ Depends: URL: https://OHDSI.github.io/OmopSketch/ BugReports: https://github.com/OHDSI/OmopSketch/issues VignetteBuilder: knitr -Remotes: - darwin-eu/omopgenerics + diff --git a/R/mockOmopSketch.R b/R/mockOmopSketch.R index d746c8b..3cf8198 100644 --- a/R/mockOmopSketch.R +++ b/R/mockOmopSketch.R @@ -48,10 +48,60 @@ mockOmopSketch <- function(con = NULL, omock::mockProcedureOccurrence(seed = seed) |> omock::mockVisitOccurrence(seed = seed) |> # Create device exposure table - empty (Eunomia also has it empty) - omopgenerics::emptyOmopTable("device_exposure") + omopgenerics::emptyOmopTable("device_exposure")|> + checkColumns() + # WHEN WE SUPPORT LOCAL CDMs WE WILL HAVE TO ACCOUNT FOR THAT HERE cdm <- CDMConnector::copy_cdm_to(con = con, cdm = cdm, schema = writeSchema) return(cdm) }
+
+# Add the OMOP CDM columns that are missing from the tables of a local cdm.
+#
+# Each table in `cdm_local` is compared against the "cdm_table" fields listed
+# by omopgenerics::omopTableFields(); every field the table lacks is appended
+# as an all-NA column.
+#
+# cdm_local: named collection of tables (as produced by the omock pipeline in
+#   mockOmopSketch()).
+# Returns `cdm_local` with the missing columns added to each table.
+checkColumns <- function(cdm_local) {
+  # Missing value for a cdm datatype: "integer" and varchar fields get a typed
+  # NA, any other datatype falls back to a plain (logical) NA.
+  typedNa <- function(datatype) {
+    if (identical(datatype, "integer")) {
+      NA_integer_
+    } else if (isTRUE(grepl("varchar", datatype))) {
+      NA_character_
+    } else {
+      NA
+    }
+  }
+
+  fields <- omopgenerics::omopTableFields() |>
+    dplyr::filter(.data$type == "cdm_table")
+
+  for (table in names(cdm_local)) {
+    missing_cols <- fields |>
+      dplyr::filter(
+        .data$cdm_table_name == table,
+        !(.data$cdm_field_name %in% colnames(cdm_local[[table]]))
+      )
+
+    if (nrow(missing_cols) > 0) {
+      na_cols <- rlang::set_names(
+        lapply(missing_cols$cdm_datatype, typedNa),
+        missing_cols$cdm_field_name
+      )
+      # The one-row tibble of NAs is recycled by bind_cols() to the table size.
+      missing_tbl <- tibble::tibble(!!!na_cols)
+      cdm_local[[table]] <- dplyr::bind_cols(cdm_local[[table]], missing_tbl)
+    }
+  }
+
+  return(cdm_local)
+}
+
+
diff --git a/R/plotConceptSetCounts.R b/R/plotConceptSetCounts.R index d908293..f59e172 100644 --- a/R/plotConceptSetCounts.R +++ b/R/plotConceptSetCounts.R @@ -54,13 +54,13 @@ plotConceptSetCounts <- function(result, )) } - result1 <- result |> omopgenerics::splitAdditional() + result1 <- result |> omopgenerics::splitAll() # Detect if there are several time intervals if("time_interval" %in% colnames(result1)){ # Line plot where each concept is a different line p <- result1 |> dplyr::filter(.data$time_interval != "overall") |> - omopgenerics::uniteAdditional(cols = c("time_interval", "standard_concept_name", "standard_concept_id", "source_concept_name", "source_concept_id", "domain_id")) |> + omopgenerics::pivotEstimates() |> visOmopResults::scatterPlot(x = "time_interval", y = "count", line = TRUE, @@ -71,17 +71,28 @@ plotConceptSetCounts <- function(result, colour = colour) }else{ if("standard_concept_name" %in% colnames(result1)){ - p <- result |> + p <- result1 |> + omopgenerics::pivotEstimates() |> visOmopResults::barPlot(x = c("standard_concept_name", "standard_concept_id"), y = "count", facet = facet, colour = colour) + p$data <- p$data |> + dplyr::mutate( + standard_concept_name_standard_concept_id = factor( + .data$standard_concept_name_standard_concept_id, + levels = c("overall - overall", sort(setdiff(.data$standard_concept_name_standard_concept_id, "overall - overall"))) + ) + ) + }else{ - p <- result |> + p <- result1 |> visOmopResults::barPlot(x = "codelist_name", y = "count", facet = facet, colour = colour) + p$data <- p$data |> + dplyr::arrange(.data$codelist_name) } p <- p + ggplot2::labs( diff --git a/R/plotInObservation.R b/R/plotInObservation.R index 750297e..f79b930 100644 --- a/R/plotInObservation.R +++ b/R/plotInObservation.R @@ -59,8 +59,7 @@ plotInObservation <- function(result, # plot
if(length(unique(result$additional_level)) > 1 ){ - result |> - dplyr::mutate(additional_level = as.character(gsub("-01$","",as.Date(gsub(" to.*","",.data$additional_level))))) |> + p <- result |> dplyr::filter(.data$estimate_name == "count") |> visOmopResults::scatterPlot( x = "time_interval", @@ -78,6 +77,21 @@ plotInObservation <- function(result, y = variable, x = "Date" ) + p$data <- p$data |> + dplyr::arrange(.data$time_interval) |> + dplyr::mutate( + show_label = seq_along(.data$time_interval) %% ceiling(nrow(p$data) / 20) == 0 + ) + + p <- p + + ggplot2::scale_x_discrete( + breaks = p$data$time_interval[p$data$show_label] + ) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, size = 8), + plot.margin = ggplot2::margin(t = 5, r = 5, b = 30, l = 5) + ) + p }else{ result |> dplyr::filter(.data$estimate_name == "count") |> diff --git a/R/plotRecordCount.R b/R/plotRecordCount.R index 818122c..3729960 100644 --- a/R/plotRecordCount.R +++ b/R/plotRecordCount.R @@ -59,6 +59,30 @@ plotRecordCount <- function(result, y = "Number records", x = "Date" ) + p$data <- p$data |> + dplyr::arrange(.data$time_interval) |> + dplyr::group_by(.data$omop_table) |> + dplyr::mutate( + show_label = if (dplyr::cur_group_id() == 1) { + seq_along(.data$time_interval) %% ceiling(dplyr::n() / 20) == 0 + } else { + FALSE + } + ) |> + dplyr::ungroup() + + # Modify the plot + p <- p + + ggplot2::scale_x_discrete( + breaks = p$data$time_interval[p$data$show_label], + labels = p$data$time_interval[p$data$show_label] + ) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, size = 8), + plot.margin = ggplot2::margin(t = 5, r = 5, b = 30, l = 5) + ) + + }else{ p <- result |> visOmopResults::barPlot(x = "variable_name", @@ -70,5 +94,7 @@ plotRecordCount <- function(result, x = "" ) } + p } + diff --git a/README.Rmd b/README.Rmd index f68aa45..02c4676 100644 --- a/README.Rmd +++ b/README.Rmd @@ -16,13 +16,10 @@ knitr::opts_chunk$set( # 
OmopSketch OmopSketch website -[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) -[![R-CMD-check](https://github.com/OHDSI/OmopSketch/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/OHDSI/OmopSketch/actions/workflows/R-CMD-check.yaml) -[![CRAN status](https://www.r-pkg.org/badges/version/OmopSketch)](https://CRAN.R-project.org/package=OmopSketch) -[![Codecov test coverage](https://codecov.io/gh/OHDSI/OmopSketch/branch/main/graph/badge.svg)](https://app.codecov.io/gh/OHDSI/OmopSketch?branch=main) - -### WARNING: this package is under-development and has only been tested using mock data +[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) [![R-CMD-check](https://github.com/OHDSI/OmopSketch/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/OHDSI/OmopSketch/actions/workflows/R-CMD-check.yaml) [![CRAN status](https://www.r-pkg.org/badges/version/OmopSketch)](https://CRAN.R-project.org/package=OmopSketch) [![Codecov test coverage](https://codecov.io/gh/OHDSI/OmopSketch/branch/main/graph/badge.svg)](https://app.codecov.io/gh/OHDSI/OmopSketch?branch=main) + + The goal of OmopSketch is to characterise and visualise an OMOP CDM instance to asses if it meets the necessary criteria to answer a specific clinical question and conduct a certain study. @@ -48,16 +45,19 @@ con <- dbConnect(duckdb(), eunomia_dir()) cdm <- cdmFromCon(con = con, cdmSchema = "main", writeSchema = "main") cdm ``` + ### Snapshot + We first create a snapshot of our database. This will allow us to track when the analysis has been conducted and capture details about the CDM version or the data release. 
+ ```{r} summariseOmopSnapshot(cdm) |> tableOmopSnapshot(type = "flextable") ``` - ### Characterise the clinical tables -Once we have collected the snapshot information, we can start characterising the clinical tables of the CDM. By using `summariseClinicalRecords()` and `tableClinicalRecords()`, we can easily visualise the main characteristics of specific clinical tables. + +Once we have collected the snapshot information, we can start characterising the clinical tables of the CDM. By using `summariseClinicalRecords()` and `tableClinicalRecords()`, we can easily visualise the main characteristics of specific clinical tables. ```{r} summariseClinicalRecords(cdm, c("condition_occurrence", "drug_exposure")) |> @@ -67,45 +67,45 @@ summariseClinicalRecords(cdm, c("condition_occurrence", "drug_exposure")) |> We can also explore trends in the clinical table records over time. ```{r} -summariseRecordCount(cdm, c("condition_occurrence", "drug_exposure")) |> - plotRecordCount(facet = "omop_table") +summariseRecordCount(cdm, c("condition_occurrence", "drug_exposure"), interval = "years") |> + plotRecordCount(facet = "omop_table", colour = "cdm_name") ``` + ### Characterise the observation period + After visualising the main characteristics of our clinical tables, we can explore the observation period details. OmopSketch provides several functions to have an overview the dataset study period. Using `summariseInObservation()` and `plotInObservation()`, we can gather information on the number of records per year. ```{r} -summariseInObservation(cdm$observation_period, output = "records") |> - plotInObservation() +summariseInObservation(cdm$observation_period, output = "records", interval = "years") |> + plotInObservation(colour = "cdm_name") ``` -You can also visualise and explore the characteristics of the observation period per each individual in the database using `summariseObservationPeriod()`. 
+ +You can also visualise and explore the characteristics of the observation period per each individual in the database using `summariseObservationPeriod()`. + ```{r} summariseObservationPeriod(cdm$observation_period) |> tableObservationPeriod(type = "flextable") ``` Or if visualisation is preferred, you can easily build a histogram to explore how many participants have more than one observation period. + ```{r} summariseObservationPeriod(cdm$observation_period) |> - plotObservationPeriod() + plotObservationPeriod(colour = "observation_period_ordinal") ``` ### Characterise the concepts + OmopSketch also provides functions to explore some of (or all) the concepts in the dataset. + ```{r} acetaminophen <- c(1125315, 1127433, 1127078) summariseConceptSetCounts(cdm, conceptSet = list("acetaminophen" = acetaminophen)) |> filter(variable_name == "Number records") |> - plotConceptSetCounts() + plotConceptSetCounts(colour = "codelist_name") ``` -### Characterise the population -Finally, OmopSketch can also help us to characterise the population at the start and end of the observation period. -```{r} -summarisePopulationCharacteristics(cdm) |> - tablePopulationCharacteristics(type = "flextable") -``` As seen, OmopSketch offers multiple functionalities to provide a general overview of a database. Additionally, it includes more tools and arguments that allow for deeper exploration, helping to assess the database's suitability for specific research studies. For further information, please refer to the vignettes. 
- diff --git a/README.md b/README.md index e939a3f..ab38493 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,8 @@ experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](h status](https://www.r-pkg.org/badges/version/OmopSketch)](https://CRAN.R-project.org/package=OmopSketch) [![Codecov test coverage](https://codecov.io/gh/OHDSI/OmopSketch/branch/main/graph/badge.svg)](https://app.codecov.io/gh/OHDSI/OmopSketch?branch=main) - -### WARNING: this package is under-development and has only been tested using mock data + The goal of OmopSketch is to characterise and visualise an OMOP CDM instance to asses if it meets the necessary criteria to answer a @@ -69,7 +68,6 @@ version or the data release. ``` r summariseOmopSnapshot(cdm) |> tableOmopSnapshot(type = "flextable") -#> ! Results have not been suppressed. ``` @@ -84,35 +82,14 @@ visualise the main characteristics of specific clinical tables. ``` r summariseClinicalRecords(cdm, c("condition_occurrence", "drug_exposure")) |> tableClinicalRecords(type = "flextable") -#> ℹ Summarising table counts -#> ℹ The following estimates will be computed: -#> → Start summary of data, at 2024-09-25 12:14:06.676817 -#> -#> ✔ Summary finished, at 2024-09-25 12:14:06.815944 -#> ℹ Summarising records per person -#> ℹ The following estimates will be computed: -#> • records_per_person: mean, sd, median, q25, q75, min, max -#> ! 
Table is collected to memory as not all requested estimates are supported on -#> the database side -#> → Start summary of data, at 2024-09-25 12:14:07.908258 -#> -#> ✔ Summary finished, at 2024-09-25 12:14:07.955041 -#> ℹ Summarising in_observation, standard, domain_id, and type information -#> ℹ Summarising table counts -#> ℹ The following estimates will be computed: -#> → Start summary of data, at 2024-09-25 12:14:11.725276 -#> -#> ✔ Summary finished, at 2024-09-25 12:14:11.877293 -#> ℹ Summarising records per person -#> ℹ The following estimates will be computed: -#> • records_per_person: mean, sd, median, q25, q75, min, max -#> ! Table is collected to memory as not all requested estimates are supported on -#> the database side -#> → Start summary of data, at 2024-09-25 12:14:12.808874 -#> -#> ✔ Summary finished, at 2024-09-25 12:14:12.850686 -#> ℹ Summarising in_observation, standard, domain_id, and type information -#> ! Results have not been suppressed. +#> ℹ Adding variables of interest to condition_occurrence. +#> ℹ Summarising records per person in condition_occurrence. +#> ℹ Summarising condition_occurrence: `in_observation`, `standard_concept`, +#> `source_vocabulary`, `domain_id`, and `type_concept`. +#> ℹ Adding variables of interest to drug_exposure. +#> ℹ Summarising records per person in drug_exposure. +#> ℹ Summarising drug_exposure: `in_observation`, `standard_concept`, +#> `source_vocabulary`, `domain_id`, and `type_concept`. ``` @@ -120,47 +97,47 @@ summariseClinicalRecords(cdm, c("condition_occurrence", "drug_exposure")) |> We can also explore trends in the clinical table records over time. ``` r -summariseRecordCount(cdm, c("condition_occurrence", "drug_exposure")) |> - plotRecordCount(facet = "omop_table") -#> ! 
The following column type were changed: -#> • variable_level: from double to character +summariseRecordCount(cdm, c("condition_occurrence", "drug_exposure"), interval = "years") |> + plotRecordCount(facet = "omop_table", colour = "cdm_name") ``` - \### -Characterise the observation period After visualising the main -characteristics of our clinical tables, we can explore the observation -period details. OmopSketch provides several functions to have an -overview of the dataset study period. + + +### Characterise the observation period + +After visualising the main characteristics of our clinical tables, we +can explore the observation period details. OmopSketch provides several +functions to have an overview of the dataset study period. Using `summariseInObservation()` and `plotInObservation()`, we can gather information on the number of records per year. ``` r -summariseInObservation(cdm$observation_period, output = "records") |> - plotInObservation() -#> ! The following column type were changed: -#> • variable_level: from double to character +summariseInObservation(cdm$observation_period, output = "records", interval = "years") |> + plotInObservation(colour = "cdm_name") +#> `result_id` is not present in result. +#> `result_id` is not present in result. ``` - You -can also visualise and explore the characteristics of the observation -period per each individual in the database using + + +You can also visualise and explore the characteristics of the +observation period per each individual in the database using `summariseObservationPeriod()`. ``` r summariseObservationPeriod(cdm$observation_period) |> tableObservationPeriod(type = "flextable") -#> ! Results have not been suppressed. ``` -Or if visualisation is prefered, you can easily build a histogram to +Or if visualisation is preferred, you can easily build a histogram to explore how many participants have more than one observation period. 
``` r summariseObservationPeriod(cdm$observation_period) |> - plotObservationPeriod() + plotObservationPeriod(colour = "observation_period_ordinal") ``` @@ -174,561 +151,15 @@ concepts in the dataset. acetaminophen <- c(1125315, 1127433, 1127078) summariseConceptSetCounts(cdm, conceptSet = list("acetaminophen" = acetaminophen)) |> - filter(estimate_name == "record_count") |> - plotConceptCounts() -#> ℹ Getting use of codes from acetaminophen -#> ! The following column type were changed: -#> • variable_name: from integer to character + filter(variable_name == "Number records") |> + plotConceptSetCounts(colour = "codelist_name") +#> Warning: ! `codelist` contains numeric values, they are casted to integers. +#> ℹ Searching concepts from domain drug in drug_exposure. +#> ℹ Counting concepts ``` -### Characterise the population - -Finally, OmopSketch can also help us to characterise the population at -the start and end of the observation period. - -``` r -summarisePopulationCharacteristics(cdm) |> - tablePopulationCharacteristics(type = "flextable") -#> Warning: ! 1 casted column in og_015_1727262876 (cohort_set) as do not match expected -#> column type: -#> • `cohort_definition_id` from numeric to integer -#> Warning: ! 1 column in og_015_1727262876 do not match expected column type: -#> • `cohort_definition_id` is numeric but expected integer -#> ! cohort columns will be reordered to match the expected order: -#> cohort_definition_id, subject_id, cohort_start_date, and cohort_end_date. -#> ℹ Building new trimmed cohort -#> Warning: ! 1 column in tmp_011_og_017_1727262877 do not match expected column type: -#> • `cohort_definition_id` is numeric but expected integer -#> Creating initial cohort -#> ! cohort columns will be reordered to match the expected order: -#> cohort_definition_id, subject_id, cohort_start_date, and cohort_end_date. -#> ! 
cohort columns will be reordered to match the expected order: -#> cohort_definition_id, subject_id, cohort_start_date, and cohort_end_date. -#> ✔ Cohort trimmed -#> ℹ adding demographics columns -#> -#> ℹ summarising data -#> -#> ✔ summariseCharacteristics finished! -#> -#> ! The following column type were changed: -#> • variable_name: from integer to character -#> ! Results have not been suppressed. -``` - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Variable nameVariable levelEstimate name - Database name -
Synthea synthetic health database
Number records-N2,694
Number subjects-N2,694
Cohort start date-Median [Q25 - Q75]1961-03-18 [1950-07-13 - 1970-08-29]
Range1908-09-22 to 1986-11-03
Cohort end date-Median [Q25 - Q75]2018-12-14 [2018-08-02 - 2019-04-06]
Range1945-07-20 to 2019-07-03
Age at start-Median [Q25 - Q75]0 [0 - 0]
Mean (SD)0.00 (0.00)
Range0 to 0
Age at end-Median [Q25 - Q75]57 [47 - 67]
Range31 to 110
SexFemaleN%1,373 (50.97)
MaleN%1,321 (49.03)
Prior observation-Median [Q25 - Q75]0 [0 - 0]
Mean (SD)0.00 (0.00)
Range0 to 0
Future observation-Median [Q25 - Q75]20,870 [17,494 - 24,701]
Mean (SD)21,601.60 (5,460.69)
Range11,396 to 40,348
-
- As seen, OmopSketch offers multiple functionalities to provide a general overview of a database. Additionally, it includes more tools and arguments that allow for deeper exploration, helping to assess the diff --git a/man/figures/README-unnamed-chunk-3-1.png b/man/figures/README-unnamed-chunk-3-1.png index 5e3d1f5..e25b42c 100644 Binary files a/man/figures/README-unnamed-chunk-3-1.png and b/man/figures/README-unnamed-chunk-3-1.png differ diff --git a/man/figures/README-unnamed-chunk-4-1.png b/man/figures/README-unnamed-chunk-4-1.png index 517c6dd..9a61c65 100644 Binary files a/man/figures/README-unnamed-chunk-4-1.png and b/man/figures/README-unnamed-chunk-4-1.png differ diff --git a/man/figures/README-unnamed-chunk-5-1.png b/man/figures/README-unnamed-chunk-5-1.png index 2279da3..582a986 100644 Binary files a/man/figures/README-unnamed-chunk-5-1.png and b/man/figures/README-unnamed-chunk-5-1.png differ diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png index 5159d17..bc9460b 100644 Binary files a/man/figures/README-unnamed-chunk-6-1.png and b/man/figures/README-unnamed-chunk-6-1.png differ diff --git a/man/figures/README-unnamed-chunk-7-1.png b/man/figures/README-unnamed-chunk-7-1.png index 99bfb66..4bf5e87 100644 Binary files a/man/figures/README-unnamed-chunk-7-1.png and b/man/figures/README-unnamed-chunk-7-1.png differ diff --git a/man/figures/README-unnamed-chunk-8-1.png b/man/figures/README-unnamed-chunk-8-1.png index eea9611..0b205a1 100644 Binary files a/man/figures/README-unnamed-chunk-8-1.png and b/man/figures/README-unnamed-chunk-8-1.png differ diff --git a/man/figures/README-unnamed-chunk-9-1.png b/man/figures/README-unnamed-chunk-9-1.png index b5f0a1e..4cb6c63 100644 Binary files a/man/figures/README-unnamed-chunk-9-1.png and b/man/figures/README-unnamed-chunk-9-1.png differ