Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add names for bundles and test they are consistent #17

Merged
merged 9 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@ Encoding: UTF-8
LazyData: true
Imports:
dplyr (>= 1.0),
glue (>= 1.0),
purrr (>= 1.0),
readr (>= 2.0)
readr (>= 2.0),
vroom (>= 1.0)
Suggests:
devtools,
usethis,
testthat (>= 3.0.0),
lintr,
pak,
pkgload
pkgload,
lintr,
stringr,
testthat (>= 3.0.0),
usethis
Config/testthat/edition: 3
RoxygenNote: 7.3.2
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ export(available_bundles)
export(concept_by_bundle)
importFrom(dplyr,mutate)
importFrom(dplyr,tibble)
importFrom(glue,glue)
importFrom(purrr,map_dfr)
importFrom(readr,read_csv)
importFrom(vroom,problems)
38 changes: 11 additions & 27 deletions R/bundles.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#' @importFrom dplyr tibble mutate
#' @importFrom purrr map_dfr
#' @importFrom readr read_csv
#' @importFrom glue glue


#' @title Get available bundles for a version
#'
Expand All @@ -13,31 +15,13 @@
#' available_bundles()
#' available_bundles("0.1")
available_bundles <- function(version = "latest") {
raw_dir <- .get_raw_dir(version = version)
directories <- list.dirs(raw_dir, full.names = TRUE)
domain_directories <- directories[directories != raw_dir]

purrr::map_dfr(domain_directories, .build_concepts_from_directory) |>
mutate(version = version)
}
raw_dir <- get_raw_dir(version = version)
directories <- dir(raw_dir, full.names = TRUE)
bundle_name_paths <- file.path(directories, "bundle_names.csv")

.get_raw_dir <- function(version, ...) {
if (version != "latest") warning("Versioning not yet implemented, using version = 'latest'")

system.file("data-raw", ..., package = "omopbundles", mustWork = TRUE)
}

.build_concepts_from_directory <- function(directory) {
concept_files <- list.files(directory)
concept_name <- NULL

dplyr::tibble(
id = concept_files,
concept_name = concept_files,
domain = basename(directory)
) |>
dplyr::mutate(concept_name = sub("\\.csv$", "", concept_name)) |>
dplyr::mutate(concept_name = gsub("_", " ", concept_name))
purrr::map_dfr(bundle_name_paths, parse_bundle_names) |>
mutate(version = version)
}


Expand All @@ -52,12 +36,12 @@ available_bundles <- function(version = "latest") {
#' @export
#' @examples
#' # Usage with available_bundles, from a single row
#' smoking_info <- available_bundles() |> dplyr::filter(concept_name == "smoking")
#' concept_by_bundle(domain = smoking_info$domain, id = smoking_info$id, version = smoking_info$version)
#' smoking <- available_bundles() |> dplyr::filter(concept_name == "Smoking")
#' concept_by_bundle(domain = smoking$domain, id = smoking$id, version = smoking$version)
#' # Using if you know the details directly
#' concept_by_bundle(domain = "observation", id = "smoking.csv")
#' concept_by_bundle(domain = "observation", id = "smoking")
concept_by_bundle <- function(domain, id, version = "latest") {
.get_raw_dir(version = version, domain, id) |>
get_raw_dir(version = version, domain, "bundles", glue::glue("{id}.csv")) |>
readr::read_csv(show_col_types = FALSE) |>
dplyr::mutate(domain = domain)
}
74 changes: 74 additions & 0 deletions R/raw-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' @importFrom dplyr mutate
#' @importFrom purrr map_dfr
#' @importFrom readr read_csv
#' @importFrom glue glue
#' @importFrom vroom problems


# The package will store releases internally, and not in raw data
# Until that happens, this will duplicate the bundle code

#' @title Get available raw bundles
#'
#' @description If a bundle has multiple names, then the id will be duplicated across rows
#'
#' @return dataframe that contains a "concept_name" and a "domain" column for each available concept
stefpiatek marked this conversation as resolved.
Show resolved Hide resolved
#' @keywords internal
raw_bundles <- function() {
raw_dir <- get_raw_dir()
directories <- dir(raw_dir, full.names = TRUE)
bundle_name_paths <- file.path(directories, "bundle_names.csv")

purrr::map_dfr(bundle_name_paths, parse_bundle_names)
}


get_raw_dir <- function(..., version = "latest") {
if (version != "latest") warning("Versioning not yet implemented, using version = 'latest'")

file_path <- system.file("data-raw", ..., package = "omopbundles")

if (!file.exists(file_path)) {
path <- paste(..., sep = "/")
milanmlft marked this conversation as resolved.
Show resolved Hide resolved
stop(glue::glue("File not found in raw data, path given: {path}"))
}

file_path
}


parse_bundle_names <- function(bundle_name_path) {
bundle_name <- bundle_name_path |>
dirname() |>
basename()

readr::read_csv(bundle_name_path, col_types = "cc") |>
mutate(domain = bundle_name)
}



#' @title Get concepts for a single bundle row
#'
#' @description Retrieves concept data for a specific bundle.
#'
#' @param domain The domain of the bundle.
#' @param id The ID of the bundle.
#' @return A data frame with the concept data.
stefpiatek marked this conversation as resolved.
Show resolved Hide resolved
#' @keywords internal
raw_concept_by_bundle <- function(domain, id) {
file <- get_raw_dir(domain, "bundles", glue::glue("{id}.csv"))

concepts <- file |>
readr::read_csv(show_col_types = FALSE)

# Check for parsing problems
parsing_problems <- vroom::problems(concepts)
if (nrow(parsing_problems) > 0) {
warning(glue::glue("Warning while parsing: {file}"))
warning(parsing_problems)
}

concepts |>
dplyr::mutate(domain = domain)
}
35 changes: 35 additions & 0 deletions inst/data-raw/measurement/bundle_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
id,concept_name
HBA1c,HBA1c
LDH,LDH
antibodies_to_hiv,Antibodies to HIV
antigens,Antigens
blood_antibodies,Blood antibodies
bone_profile,Bone profile
covid_antibody_levels,Covid antibody levels
eGFR,eGFR
full_blood_count_differential,Full blood count differential
haemoglobin,Haemoglobin
haptoglobin,Haptoglobin
height_and_weight,Height and weight
hepatitis_b_screening,Hepatitis B screening
hepatitis_c_screening,Hepatitis C screening
immunofixation_chains,Immunofixation chains
immunoglobulins,Immunoglobulins
light_chains,Light chains
liver_function_test,Liver function test
liver_profile,Liver profile
monoclonal_bands,Monoclonal bands
plasma_viscosity,Plasma viscosity
renal_profile,Renal profile
serum_albumin,Serum albumin
serum_beta_2_microglobulin,Serum beta 2 microglobulin
serum_corrected_calcium,Serum corrected calcium
serum_creatinine,Serum creatinine
serum_glucose,Serum glucose
serum_immunofixation,Serum immunofixation
serum_urea,Serum urea
thyroid_profile,Thyroid profile
total_protein,Total protein
urea_electrolyte,Urea electrolyte
urine_albumin,Urine albumin
urine_bence_jones_protein,Urine bence jones protein
10 changes: 10 additions & 0 deletions inst/data-raw/measurement/bundles/haemoglobin.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
concept_id,allowed,per_person,min,max
3031973,T,0,0,
3037653,T,0,0,
3029071,T,0,0,
42869613,T,0,0,
3045566,T,0,0,
3046405,T,0,0,
3046708,T,0,0,
3045566,T,0,0,
3046405,T,0,0,
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
concept_id,allowed,per_person,min,max
3012336,T,0,
42868683,T,0,
3012336,T,0,,
42868683,T,0,,
10 changes: 0 additions & 10 deletions inst/data-raw/measurement/haemoglobin.csv

This file was deleted.

2 changes: 2 additions & 0 deletions inst/data-raw/observation/bundle_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
id,concept_name
smoking,Smoking
2 changes: 2 additions & 0 deletions inst/data-raw/race/bundle_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
id,concept_name
snomed_race,Race
2 changes: 2 additions & 0 deletions inst/data-raw/unknown/bundle_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
id,concept_name
indices_of_deprivation,Indices of deprivation
8 changes: 4 additions & 4 deletions man/concept_by_bundle.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/raw_bundles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions man/raw_concept_by_bundle.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 1 addition & 7 deletions renv.lock
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,7 @@
"glue": {
"Package": "glue",
"Version": "1.7.0",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"methods"
],
"Hash": "e0b3a53876554bd45879e596cdb10a52"
"Source": "Repository"
},
"highr": {
"Package": "highr",
Expand Down
15 changes: 5 additions & 10 deletions tests/testthat/test-bundles.R
Original file line number Diff line number Diff line change
@@ -1,36 +1,31 @@
library(dplyr)
library(omopbundles)
library(testthat)


test_that("available_bundles isn't empty and have correct columns", {
result <- omopbundles::available_bundles()
expect_true(nrow(result) > 0, info = "The dataframe should not be empty")
hiv_ab <- filter(result, concept_name == "antibodies to hiv")
hiv_ab <- dplyr::filter(result, concept_name == "Antibodies to HIV")
expect_equal(hiv_ab$version, "latest")
expect_equal(hiv_ab$id, "antibodies_to_hiv.csv")
expect_equal(hiv_ab$id, "antibodies_to_hiv")
expect_equal(hiv_ab$domain, "measurement")

})

test_that("Smoking exists as an observation", {
result <- available_bundles() |>
dplyr::filter(concept_name == "smoking")
dplyr::filter(concept_name == "Smoking")

expect_true(nrow(result) == 1, info = "Smoking should only exist as a single row")
expect_equal(result$domain, "observation")
})

test_that("Concept by bundle works with character values", {
smoking_concepts <- omopbundles::concept_by_bundle(domain = "observation", id = "smoking.csv")
smoking_concepts <- concept_by_bundle(domain = "observation", id = "smoking")
expect_true(nrow(smoking_concepts) > 1, info = "Smoking should have multiple concepts")
expect_false(any(is.na(smoking_concepts$concept_id)), info = "Concept ids should not be NA")
expect_true(all(smoking_concepts$domain == "observation"), info = "Domain should be set correctly")
})

test_that("Available bundles and concept_by_bundle play nicely together", {
smoking_bundle <- available_bundles() |>
dplyr::filter(concept_name == "smoking")
dplyr::filter(concept_name == "Smoking")

smoking_concepts <- concept_by_bundle(smoking_bundle$domain, smoking_bundle$id)

Expand Down
42 changes: 42 additions & 0 deletions tests/testthat/test-raw-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
extract_bundle_details <- function(path) {
filename <- stringr::str_remove(basename(path), "\\.csv$")

# get the domain
parts <- stringr::str_split(path, "/")[[1]]
domain <- parts[length(parts) - 2]

list(id = filename, domain = domain)
}

assert_bundle_has_name <- function(bundle) {
bundle_name_file <- get_raw_dir(bundle$domain, "bundle_names.csv")
bundle_names <- read_csv(bundle_name_file, show_col_types = FALSE)

expect_true(bundle$id %in% bundle_names$id,
glue::glue("{bundle$id} should at least one name in: {bundle$domain}/bundle_names.csv"))
}

test_that("All raw bundles have at least one name", {
raw_dir <- get_raw_dir()

concept_files <- Sys.glob(file.path(raw_dir, "*", "bundles", "*.csv"))
bundle_ids <- purrr::map(concept_files, extract_bundle_details)
purrr::walk(bundle_ids, assert_bundle_has_name)
})



test_that("All raw bundle names map to a bundle file that has at least one concept", {
bundles <- raw_bundles()

# Ensure bundles dataframe is not empty
expect_true(nrow(bundles) > 0)

apply(bundles, 1, function(bundle) {
concepts <- raw_concept_by_bundle(bundle["domain"], bundle["id"])

# Check that at least one concept
expect_true(nrow(concepts) > 0)
})

})