Skip to content

Commit

Permalink
add arachne folder
Browse files Browse the repository at this point in the history
  • Loading branch information
ablack3 committed Jan 10, 2024
1 parent 0ef65ed commit 6068c4a
Show file tree
Hide file tree
Showing 100 changed files with 28,335 additions and 0 deletions.
Binary file not shown.
File renamed without changes.
File renamed without changes.
154 changes: 154 additions & 0 deletions arachne-ee/TestRareBloodCancersPrevalence/cdm_from_environment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@


cdm_from_environment <- function(write_prefix = "") {

connection_string <- Sys.getenv("CONNECTION_STRING")

if (connection_string == "") {
stop("CONNECTION_STRING environment variable not set.")
}

parts <- stringr::str_split(connection_string, ":")[[1]]
port_db <- stringr::str_split(parts[4], "/")[[1]]

# required args that need to be passed in
dbms <- Sys.getenv("DBMS_TYPE")
server <- stringr::str_remove(parts[3], "^//+")
port <- port_db[1]
dbname <- port_db[2]
username <- Sys.getenv("DBMS_USERNAME")
password <- Sys.getenv("DBMS_PASSWORD")
cdm_schema <- Sys.getenv("DBMS_SCHEMA")
write_schema <- Sys.getenv("RESULT_SCHEMA")


vars <- c(dbms, server, port, dbname, password, cdm_schema, write_schema)
names_vars <- c("dbms", "server", "port", "dbname", "password", "cdm_schema", "write_schema")

for (i in seq_along(vars)) {
if (nchar(vars[i]) < 1) stop(paste(names_vars[i], "is not required but not available!"))
}

supported_db <- c("postgresql", "sql server", "redshift", "duckdb", "snowflake")

if (!(dbms %in% supported_db)) {
cli::cli_abort("The environment variable DBMS_TYPE must be on one of {paste(supported_db, collapse = ', ')} not `{Sys.getenv('DBMS_TYPE')}`.")
}

if (dbms == "duckdb") {
db <- dbname
if (db == "") {
db <- "GiBleed"
}

checkmate::assert_choice(db, CDMConnector::example_datasets())
con <- DBI::dbConnect(duckdb::duckdb(), CDMConnector::eunomia_dir(db))
cdm <- CDMConnector::cdm_from_con(con, "main", "main", cdm_version = "5.3", cdm_name = db)
return(cdm)
}

cdm_schema <- stringr::str_split(cdm_schema, "\\.")[[1]]
write_schema <- stringr::str_split(write_schema, "\\.")[[1]]


print(write_schema)
print(sapply(write_schema, nchar))

if (dbms %in% c("postgresql", "redshift")) {

drv <- switch (dbms,
"postgresql" = RPostgres::Postgres(),
"redshift" = RPostgres::Redshift()
)

con <- DBI::dbConnect(drv = drv,
dbname = dbname,
host = server,
user = username,
password = password,
port = port)

if (!DBI::dbIsValid(con)) {
cli::cli_abort("Database connection failed!")
}

} else if (dbms == "sql server") {

con <- DBI::dbConnect(odbc::odbc(),
Driver = "ODBC Driver 17 for SQL Server",
Server = server,
Database = dbname,
UID = username,
PWD = password,
TrustServerCertificate="yes",
Port = port)

if (!DBI::dbIsValid(con)) {
cli::cli_abort("Database connection failed!")
}


} else if (dbms == "snowflake") {
con <- DBI::dbConnect(odbc::odbc(),
DRIVER = "SnowflakeDSIIDriver",
SERVER = server,
DATABASE = dbname,
UID = username,
PWD = password,
WAREHOUSE = "COMPUTE_WH_XS") # don't hardcode this

if (!DBI::dbIsValid(con)) {
cli::cli_abort("Database connection failed!")
}

} else {
cli::cli_abort("{dbms} is not a supported database type!")
}

# split schemas. If write schema has a dot we need to interpret it as catalog.schema
# cdm schema should not have a dot

# if (stringr::str_detect(Sys.getenv("WRITE_SCHEMA"), "\\.")) {
# write_schema <- stringr::str_split(write_schema, "\\.")[[1]]
# if (length(write_schema) != 2) {
# cli::cli_abort("write_schema can have at most one period (.)!")
# }
#
# stopifnot(nchar(write_schema[1]) > 0, nchar(write_schema[2]) > 0)
# write_schema <- c(catalog = write_schema[1], schema = write_schema[2])
# } else {
# write_schema <- c(schema = Sys.getenv("WRITE_SCHEMA"))
# }
#
# if (write_prefix != "") {
# if (dbms != "snowflake") {
# write_schema <- c(write_schema, prefix = write_prefix)
# }
# }

# add prefix
if (write_prefix != "") {
if (length(write_schema) == 1) {
write_schema <- c(schema = write_schema, prefix = write_prefix)
} else if (length(write_schema) == 2) {
write_schema <- c(catalog = write_schema[1],
schema = write_schema[2],
prefix = write_prefix)
}
} else if (length(write_schema) == 2) {
write_schema <- c(catalog = write_schema[1], schema = write_schema[2])
}

cdm <- CDMConnector::cdm_from_con(
con = con,
cdm_schema = cdm_schema,
write_schema = write_schema,
cdm_version = "5.3",
cdm_name = Sys.getenv("DATA_SOURCE_NAME", unset = "unnamed_cdm"))

if (length(names(cdm)) == 0) {
cli::cli_abort("CDM object creation failed!")
}

return(cdm)
}
168 changes: 168 additions & 0 deletions arachne-ee/TestRareBloodCancersPrevalence/main.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
library(CDMConnector)
library(IncidencePrevalence)
library(dplyr)
library(log4r)
setwd(file.path(getwd(), "TestRareBloodCancersPrevalence"))
print(getwd())
print(list.files())
# source("arachne-ee/TestRareBloodCancersPrevalence/cdm_from_environment.R")
source("cdm_from_environment.R")

# database metadata and connection details -----
# The name/ acronym for the database
db_name <- Sys.getenv("DATA_SOURCE_NAME") %>%
tolower() %>%
stringr::str_replace_all(" ", "_") %>%
stringr::str_replace_all("\\(|\\)", "")

# Set output folder location -----
# the path to a folder where the results from this analysis will be saved
# output_folder <- here::here("results") # does not work with execution engine
output_folder <- "/results"

# create cdm reference ----
# con <- DBI::dbConnect(duckdb::duckdb(), eunomia_dir())
# cdm <- cdm_from_con(con, "main", "main")

# cdm <- cdm_from_environment(write_prefix = "dw_")
prefix <- paste0("tmp", as.integer(Sys.time()) %% 1000, "_")
cdm <- cdm_from_environment(prefix)

# check database connection
# running the next line should give you a count of your person table
n <- cdm$person %>%
tally() %>%
pull(n)

print(paste(n, "persons in the CDM person table"))

print(paste("Database connection class:", class(attr(cdm, "dbcon"))))

# create directory if it does not already exist ----
if (!file.exists(output_folder)) {
dir.create(output_folder, recursive = TRUE)
}

# start log ----
log_file <- paste0(output_folder, "/log.txt")
logger <- create.logger()
logfile(logger) <- log_file
level(logger) <- "INFO"

# tables ---
table_outcome <-"outcome"
table_dpop_sex <- "dpop_sex"
table_ph <- "dpop_ph"
table_age <- "dpop_age"
table_point_prev <- "point_prev"
table_period_prev <- "period_prev"

# instantiate outcome cohorts ----
info(logger, "INSTANTIATE OUTCOME COHORTS")

outcome_cohorts <- readCohortSet(here::here("outcomeCohorts")) %>%
slice(1:2) # for testing

info(logger, "- getting outcomes")

cdm <- generateCohortSet(cdm,
outcome_cohorts,
name = table_outcome,
overwrite = TRUE)

total_subjects <- cohort_count(cdm[[table_outcome]]) %>%
summarise(total = sum(number_subjects)) %>%
pull(total)

print(paste("total subjects with outcomes:", total_subjects))

# get denominator cohorts -----
info(logger, "GETTING DENOMINATOR COHORTS")
info(logger, "- getting denominator - primary and sex")

cdm <- generateDenominatorCohortSet(
cdm = cdm,
name = table_dpop_sex,
cohortDateRange = c(as.Date("2010-01-01"), as.Date("2024-01-01")),
sex = c("Male", "Female", "Both"),
daysPriorObservation = 365,
overwrite = TRUE
)

info(logger, "- getting denominator - prior history")

cdm <- generateDenominatorCohortSet(
cdm = cdm,
name = table_ph,
cohortDateRange = c(as.Date("2010-01-01"), as.Date("2024-01-01")),
daysPriorObservation = c(0, 1095),
overwrite = TRUE
)

info(logger, "- getting denominator - age_gr")

cdm <- generateDenominatorCohortSet(
cdm = cdm,
name = table_age,
cohortDateRange = c(as.Date("2010-01-01"), as.Date("2024-01-01")),
ageGroup = list(
# age_gr_1
c(0, 9), c(10, 19), c(20, 29), c(30, 39), c(40, 49),
c(50, 59), c(60, 69), c(70, 79), c(80, 89),
c(90, 99), c(100, 150),
# age_gr_2
c(0, 44), c(45, 64), c(65, 150)
),
daysPriorObservation = 365,
overwrite = TRUE
)

# estimate prevalence -----
denominators <- c(
table_dpop_sex,
table_ph,
table_age
)

prevalence_estimates <- list()

for (i in seq_along(denominators)) {

info(logger, paste0("- getting point prevalence for ", denominators[i]))
# debugonce(estimatePointPrevalence)
prevalence_estimates[[paste0("point_prevalence_", denominators[[i]])]] <- estimatePointPrevalence(
cdm = cdm,
denominatorTable = denominators[i],
outcomeTable = table_outcome,
interval = "years",
temporary = FALSE
)

info(logger, paste0("- getting period prevalence for ", denominators[i]))


prevalence_estimates[[paste0("period_prevalence_", denominators[[i]])]] <- estimatePeriodPrevalence(
cdm = cdm,
denominatorTable = denominators[i],
outcomeTable = table_outcome,
completeDatabaseIntervals = TRUE,
fullContribution = c(TRUE, FALSE),
interval = "years",
temporary = FALSE
)
}

# gather results and export -----
info(logger, "ZIPPING RESULTS")
exportIncidencePrevalenceResults(
result = prevalence_estimates,
zipName = paste0(c(db_name, "C1_001_Results", format(Sys.Date(), format="%Y%m%d")), collapse = "_"),
outputFolder = output_folder
)

cdm_disconnect(cdm)

print("-- Thank you for running the study!")
print("-- If all has worked, there should now be a zip folder with your results in the output folder to share")


24 changes: 24 additions & 0 deletions create_zipfiles.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@


testnames <- list.files(here::here("arachne-ee"))

for (i in seq_along(testnames)) {
message(paste("writing ", testnames[i]))
withr::with_dir(here::here("arachne-ee"), {
filenames <- list.files(testnames[i], full.names = T)
zip(zipfile = paste0("../arachne-ee-zip/", testnames[i]), files = filenames)
})
}




testnames <- list.files(here::here("darwin-ee"))

for (i in seq_along(testnames)) {
message(paste("writing ", testnames[i]))
withr::with_dir(here::here("arachne-ee"), {
filenames <- list.files(testnames[i], full.names = T)
zip(zipfile = paste0("../darwin-ee-zip/", testnames[i]), files = filenames)
})
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
10 changes: 10 additions & 0 deletions darwin-ee/Untitled/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
postgresql-42.2.18.jar
CodeToRun_local.R
Results/CPRD GOLD/*
*.zip
*log.txt
.Rprofile
Loading

0 comments on commit 6068c4a

Please sign in to comment.