From 48e3644670dc75b891b74f61a9f94db7313da0ae Mon Sep 17 00:00:00 2001 From: egillax Date: Sat, 26 Oct 2024 11:21:11 +0200 Subject: [PATCH 01/15] try new action for broken links --- .github/workflows/pkgdown.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 7a5e8ac76..0e422494c 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -31,6 +31,8 @@ jobs: extra-packages: any::pkgdown, ohdsi/OhdsiRTools needs: website + - uses: gaurav-nelson/github-action-markdown-link-check@v1 + - name: Build site run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' From f3ba9bd7b9fb28a74e80fd2dc6ebd208c126ff72 Mon Sep 17 00:00:00 2001 From: egillax Date: Sat, 26 Oct 2024 11:22:28 +0200 Subject: [PATCH 02/15] add feature branch in trigger --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 0e422494c..e8bcc9ce9 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,7 +2,7 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, develop] + branches: [main, develop, broken_links] release: types: [published] workflow_dispatch: From dae188e6d81993ba3ae28869130b13056576b4d3 Mon Sep 17 00:00:00 2001 From: egillax Date: Sat, 26 Oct 2024 11:34:11 +0200 Subject: [PATCH 03/15] check R markdown as well --- .github/workflows/pkgdown.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index e8bcc9ce9..1d8230dd4 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -32,6 +32,8 @@ jobs: needs: website - uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + file-extension: 'md, rmd, Rmd' - name: Build site run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' From 706509eb847c8b8b9ad6e40885e38cb48e624e2f Mon Sep 17 00:00:00 2001 From: egillax Date: Sat, 26 Oct 2024 11:56:03 +0200 Subject: [PATCH 04/15] try lychee action --- .github/workflows/pkgdown.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 1d8230dd4..a7b34ac64 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -28,15 +28,13 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: + cache: always extra-packages: any::pkgdown, ohdsi/OhdsiRTools needs: website - - uses: gaurav-nelson/github-action-markdown-link-check@v1 + - uses: lycheeverse/lychee-action@v2 with: - file-extension: 'md, rmd, Rmd' - - - name: Build site - run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' + args: --base . 
--verbose --no-progress './**/*.md' './**/*.rmd' './**/*.Rmd' - name: Fix Hades Logo run: Rscript -e 'OhdsiRTools::fixHadesLogo()' From 02342b906ee7984f87e9456acb9a766e9d9eed23 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 09:33:27 +0100 Subject: [PATCH 05/15] accept 403 response --- .github/workflows/pkgdown.yaml | 2 +- README.md | 3 +- vignettes/BestPractices.rmd | 291 --------------------------------- vignettes/ClinicalModels.rmd | 46 ------ vignettes/Videos.rmd | 67 -------- 5 files changed, 2 insertions(+), 407 deletions(-) delete mode 100644 vignettes/BestPractices.rmd delete mode 100644 vignettes/ClinicalModels.rmd delete mode 100644 vignettes/Videos.rmd diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index a7b34ac64..446629a66 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -34,7 +34,7 @@ jobs: - uses: lycheeverse/lychee-action@v2 with: - args: --base . --verbose --no-progress './**/*.md' './**/*.rmd' './**/*.Rmd' + args: --base . --verbose --no-progress --accept '100..=103, 200..=299, 403' './**/*.md' './**/*.Rmd' - name: Fix Hades Logo run: Rscript -e 'OhdsiRTools::fixHadesLogo()' diff --git a/README.md b/README.md index aa08c3772..f70f8b13c 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,7 @@ Please read the main vignette for the package: In addition we have created vignettes that describe advanced functionality in more detail: - [Building Multiple Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) -- [Implementing Existing Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/ImplementingExistingModels.pdf) -- [Adding Custom Machine Learning Algorithms](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomAlgorithms.pdf) +- [Adding Custom Machine Learning Algorithms](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) - [Building Deep Learning Models](https://github.com/OHDSI/DeepPatientLevelPrediction) - [Building Ensemble Models](https://github.com/OHDSI/EnsemblePatientLevelPrediction) - [Creating Learning Curves](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) diff --git a/vignettes/BestPractices.rmd b/vignettes/BestPractices.rmd deleted file mode 100644 index 1fba21806..000000000 --- a/vignettes/BestPractices.rmd +++ /dev/null @@ -1,291 +0,0 @@ ---- -title: "Best Practice Research" -author: "Jenna Reps, Peter R. Rijnbeek" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - -```{=html} - -``` -## Best practice publications using the OHDSI PatientLevelPrediction framework - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Topic | Research Summary | Link |
-|-------|------------------|------|
-| Problem Specification | When is prediction suitable in observational data? | Guidelines needed |
-| Data Creation | Comparison of cohort vs case-control design | Journal of Big Data |
-| Data Creation | Addressing loss to follow-up (right censoring) | BMC medical informatics and decision makingk |
-| Data Creation | Investigating how to address left censoring in features construction | BMC Medical Research Methodology |
-| Data Creation | Impact of over/under-sampling | Journal of big data |
-| Data Creation | Impact of phenotypes | Study Done - Paper submitted |
-| Model development | How much data do we need for prediction - Learning curves at scale | International Journal of Medical Informatics |
-| Model development | What impact does test/train/validation design have on model performance | BMJ Open |
-| Model development | What is the impact of the classifier | JAMIA |
-| Model development | Can we find hyper-parameter combinations per classifier that consistently lead to good performing models when using claims/EHR data? | Study needs to be done |
-| Model development | Can we use ensembles to combine different algorithm models within a database to improve models transportability? | Caring is Sharing–Exploiting the Value in Data for Health and Innovation |
-| Model development | Can we use ensembles to combine models developed using different databases to improve models transportability? | BMC Medical Informatics and Decision Making |
-| Model development | Impact of regularization method | JAMIA |
-| Evaluation | Why prediction is not suitable for risk factor identification | Machine Learning for Healthcare Conference |
-| Evaluation | Iterative pairwise external validation to put validation into context | Drug Safety |
-| Evaluation | A novel method to estimate external validation using aggregate statistics | Study under review |
-| Evaluation | How should we present model performance? (e.g., new visualizations) | JAMIA Open |
-| Evaluation | How to interpret external validation performance (can we figure out why the performance drops or stays consistent)? | Study needs to be done |
-| Evaluation | Recalibration methods | Study needs to be done |
-| Evaluation | Is there a way to automatically simplify models? | Study protocol under development |
- diff --git a/vignettes/ClinicalModels.rmd b/vignettes/ClinicalModels.rmd deleted file mode 100644 index 3b6a5e5ae..000000000 --- a/vignettes/ClinicalModels.rmd +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: "Clinical Models" -author: "Jenna Reps, Peter R. Rijnbeek" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - -```{=html} - -``` - -## Clinical models developed using the OHDSI PatientLevelPrediction framework - -| Title | Link | -|----------------------|-------| -| Using Machine Learning Applied to Real-World Healthcare Data for Predictive Analytics: An Applied Example in Bariatric Surgery | [Value in Health](https://www.sciencedirect.com/science/article/pii/S1098301519300737) | -| Development and validation of a prognostic model predicting symptomatic hemorrhagic transformation in acute ischemic stroke at scale in the OHDSI network | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0226718) | -| Wisdom of the CROUD: development and validation of a patient-level prediction model for opioid use disorder using population-level claims data | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0228632) | -| Developing predictive models to determine Patients in End-of-life Care in Administrative datasets | [Drug Safety](https://link.springer.com/article/10.1007/s40264-020-00906-7) | -| Predictors of diagnostic transition from major depressive disorder to bipolar disorder: a retrospective observational network study | [Translational psychiatry](https://www.nature.com/articles/s41398-021-01760-6) | -| Seek COVER: using a disease proxy to rapidly develop and validate a personalized risk calculator for COVID-19 outcomes in an international network | [BMC Medical Research Methodology](https://link.springer.com/article/10.1186/s12874-022-01505-z) | -| 90-Day all-cause mortality can be predicted following a total knee replacement: an international, network study to develop and validate a prediction model | [Knee Surgery, Sports Traumatology, Arthroscopy](https://link.springer.com/article/10.1007/s00167-021-06799-y) | -| Machine learning and real-world data to predict lung cancer risk in routine care | [Cancer Epidemiology, Biomarkers & Prevention](https://aacrjournals.org/cebp/article-abstract/32/3/337/718495) | -| Development and validation of a patient-level model to predict dementia across a network of observational databases | [BMC medicine](https://link.springer.com/article/10.1186/s12916-024-03530-9) | \ No newline at end of file diff --git a/vignettes/Videos.rmd b/vignettes/Videos.rmd deleted file mode 100644 index 5f278c92b..000000000 --- a/vignettes/Videos.rmd +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "Demo Videos" -author: "Jenna Reps, Peter R. 
Rijnbeek" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - -```{=html} - -``` -## What is a cohort table? - -| Click To Launch | Description of Demo | -|-----------------------------------------------|------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/GY2ZTcizY90) | Learn what a cohort table looks like and what columns are required. | - -## Setting up a connection between your database and R - -| Click To Launch | Description of Demo | -|----------------------------------------|--------------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/8F2X5SKN64w) | Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. | - -## Running a single PatientLevelPrediction model - -| Click To Launch | Description of Demo | -|-----------------------------------------------|-------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/7AraOsTynD4/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7AraOsTynD4) | Learn how to develop and validate a single PatientLevelPrediction model. | - -## Running multiple PatientLevelPrediction models study - -| Click To Launch | Description of Demo | -|-----------------------------------------------|-------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/7wUilx580PE/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7wUilx580PE) | Learn how to develop and validate multiple PatientLevelPrediction models. | - -## Exploring the results in the shiny app - -| Click To Launch | Description of Demo | -|---------------------------------------|---------------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/BulmuH32y_Y/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/BulmuH32y_Y) | Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() | - -## Validating existing models on OMOP CDM data - -| Click To Launch | Description of Demo | -|--------------------------|----------------------------------------------| -| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/oBsfg9hfrpI) | This demo shows how you can add any existing score or logistic model and validate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. 
| From 402548dd70ce91b7d59c29d2b9109a7803899e40 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 09:42:20 +0100 Subject: [PATCH 06/15] fix file extensions --- vignettes/BestPractices.Rmd | 291 +++++++++++++++++++++++++++++++++++ vignettes/ClinicalModels.Rmd | 46 ++++++ vignettes/Videos.Rmd | 67 ++++++++ 3 files changed, 404 insertions(+) create mode 100644 vignettes/BestPractices.Rmd create mode 100644 vignettes/ClinicalModels.Rmd create mode 100644 vignettes/Videos.Rmd diff --git a/vignettes/BestPractices.Rmd b/vignettes/BestPractices.Rmd new file mode 100644 index 000000000..1fba21806 --- /dev/null +++ b/vignettes/BestPractices.Rmd @@ -0,0 +1,291 @@ +--- +title: "Best Practice Research" +author: "Jenna Reps, Peter R. Rijnbeek" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Installation Guide} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` +## Best practice publications using the OHDSI PatientLevelPrediction framework + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Topic | Research Summary | Link |
+|-------|------------------|------|
+| Problem Specification | When is prediction suitable in observational data? | Guidelines needed |
+| Data Creation | Comparison of cohort vs case-control design | Journal of Big Data |
+| Data Creation | Addressing loss to follow-up (right censoring) | BMC Medical Informatics and Decision Making |
+| Data Creation | Investigating how to address left censoring in features construction | BMC Medical Research Methodology |
+| Data Creation | Impact of over/under-sampling | Journal of Big Data |
+| Data Creation | Impact of phenotypes | Study Done - Paper submitted |
+| Model development | How much data do we need for prediction - Learning curves at scale | International Journal of Medical Informatics |
+| Model development | What impact does test/train/validation design have on model performance | BMJ Open |
+| Model development | What is the impact of the classifier | JAMIA |
+| Model development | Can we find hyper-parameter combinations per classifier that consistently lead to good performing models when using claims/EHR data? | Study needs to be done |
+| Model development | Can we use ensembles to combine different algorithm models within a database to improve models transportability? | Caring is Sharing–Exploiting the Value in Data for Health and Innovation |
+| Model development | Can we use ensembles to combine models developed using different databases to improve models transportability? | BMC Medical Informatics and Decision Making |
+| Model development | Impact of regularization method | JAMIA |
+| Evaluation | Why prediction is not suitable for risk factor identification | Machine Learning for Healthcare Conference |
+| Evaluation | Iterative pairwise external validation to put validation into context | Drug Safety |
+| Evaluation | A novel method to estimate external validation using aggregate statistics | Study under review |
+| Evaluation | How should we present model performance? (e.g., new visualizations) | JAMIA Open |
+| Evaluation | How to interpret external validation performance (can we figure out why the performance drops or stays consistent)? | Study needs to be done |
+| Evaluation | Recalibration methods | Study needs to be done |
+| Evaluation | Is there a way to automatically simplify models? | Study protocol under development |
+ diff --git a/vignettes/ClinicalModels.Rmd b/vignettes/ClinicalModels.Rmd new file mode 100644 index 000000000..3b6a5e5ae --- /dev/null +++ b/vignettes/ClinicalModels.Rmd @@ -0,0 +1,46 @@ +--- +title: "Clinical Models" +author: "Jenna Reps, Peter R. Rijnbeek" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Installation Guide} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` + +## Clinical models developed using the OHDSI PatientLevelPrediction framework + +| Title | Link | +|----------------------|-------| +| Using Machine Learning Applied to Real-World Healthcare Data for Predictive Analytics: An Applied Example in Bariatric Surgery | [Value in Health](https://www.sciencedirect.com/science/article/pii/S1098301519300737) | +| Development and validation of a prognostic model predicting symptomatic hemorrhagic transformation in acute ischemic stroke at scale in the OHDSI network | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0226718) | +| Wisdom of the CROUD: development and validation of a patient-level prediction model for opioid use disorder using population-level claims data | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0228632) | +| Developing predictive models to determine Patients in End-of-life Care in Administrative datasets | [Drug Safety](https://link.springer.com/article/10.1007/s40264-020-00906-7) | +| Predictors of diagnostic transition from major depressive disorder to bipolar disorder: a retrospective observational network study | [Translational psychiatry](https://www.nature.com/articles/s41398-021-01760-6) | +| Seek COVER: using a disease proxy to rapidly develop and validate a personalized risk calculator for COVID-19 outcomes in an international network | [BMC Medical Research Methodology](https://link.springer.com/article/10.1186/s12874-022-01505-z) | +| 90-Day all-cause mortality can be predicted following a total knee replacement: an international, network study to develop and validate a prediction model | [Knee Surgery, Sports Traumatology, Arthroscopy](https://link.springer.com/article/10.1007/s00167-021-06799-y) | +| Machine learning and real-world data to predict lung cancer risk in routine care | [Cancer Epidemiology, Biomarkers & Prevention](https://aacrjournals.org/cebp/article-abstract/32/3/337/718495) | +| Development and validation of a patient-level model to predict dementia across a network of observational databases | [BMC medicine](https://link.springer.com/article/10.1186/s12916-024-03530-9) | \ No newline at end of file diff --git a/vignettes/Videos.Rmd b/vignettes/Videos.Rmd new file mode 100644 index 000000000..5f278c92b --- /dev/null +++ b/vignettes/Videos.Rmd @@ -0,0 +1,67 @@ +--- +title: "Demo Videos" +author: "Jenna Reps, Peter R. 
Rijnbeek" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Installation Guide} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` +## What is a cohort table? + +| Click To Launch | Description of Demo | +|-----------------------------------------------|------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/GY2ZTcizY90) | Learn what a cohort table looks like and what columns are required. | + +## Setting up a connection between your database and R + +| Click To Launch | Description of Demo | +|----------------------------------------|--------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/8F2X5SKN64w) | Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. | + +## Running a single PatientLevelPrediction model + +| Click To Launch | Description of Demo | +|-----------------------------------------------|-------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/7AraOsTynD4/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7AraOsTynD4) | Learn how to develop and validate a single PatientLevelPrediction model. | + +## Running multiple PatientLevelPrediction models study + +| Click To Launch | Description of Demo | +|-----------------------------------------------|-------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/7wUilx580PE/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7wUilx580PE) | Learn how to develop and validate multiple PatientLevelPrediction models. | + +## Exploring the results in the shiny app + +| Click To Launch | Description of Demo | +|---------------------------------------|---------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BulmuH32y_Y/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/BulmuH32y_Y) | Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() | + +## Validating existing models on OMOP CDM data + +| Click To Launch | Description of Demo | +|--------------------------|----------------------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/oBsfg9hfrpI) | This demo shows how you can add any existing score or logistic model and validate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. 
| From 7ed220b7ef05a6b04670d0d4f0622132077284fa Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 10:05:08 +0100 Subject: [PATCH 07/15] fix links in vignette --- vignettes/BuildingPredictiveModels.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index d805c3e1c..e62cf10c0 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -1236,8 +1236,7 @@ The package has much more functionality than described in this vignette and cont | Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) | | Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) | | Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) | -| Ensemble models | This vignette describes how you can use the framework to build ensemble models, i.e combine multiple models in a super learner | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf) | -| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf) | +| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) | # Demos From b1f8386a48970b2fe3ee1786e42c9d1198f148d9 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 10:56:18 +0100 Subject: [PATCH 08/15] fix links in vignette and branch name --- vignettes/BuildingPredictiveModels.Rmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index e62cf10c0..8b210ee44 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -104,7 +104,7 @@ The final study population in which we will develop our model is often a subset ## Model development settings -To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. 
Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the [`AddingCustomModels`](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf) vignette. +To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the [`AddingCustomModels`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) vignette. Our package currently contains the following algorithms to choose from: @@ -460,7 +460,7 @@ In the PatientLevelPrediction package, the splitSettings define how the plpData ) ``` -Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf). +Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf). ### Preprocessing the training data @@ -472,7 +472,7 @@ The default sample settings does nothing, it simply returns the trainData as inp sampleSettings <- createSampleSettings() ``` -However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). +However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf). It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing: @@ -480,7 +480,7 @@ It is possible to specify a combination of feature engineering functions that ta featureEngineeringSettings <- createFeatureEngineeringSettings() ``` -However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomFeatureEngineering.pdf). 
+However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf). Finally, the preprocessing setting is required. For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. @@ -850,7 +850,7 @@ In the PatientLevelPrediction package, the splitSettings define how the plpData ) ``` -Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf). +Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf). ### Preprocessing the training data @@ -862,7 +862,7 @@ The default sample settings does nothing, it simply returns the trainData as inp sampleSettings <- createSampleSettings() ``` -However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). +However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf). It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing: @@ -870,7 +870,7 @@ It is possible to specify a combination of feature engineering functions that ta featureEngineeringSettings <- createFeatureEngineeringSettings() ``` -However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomfeatureEngineering.pdf). +However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf). Finally, the preprocessing setting is required. 
For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. From 19fe92c05ad2b4e5de0a78b14cad58bb2810c227 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 13:02:40 +0100 Subject: [PATCH 09/15] [WIP] working on main vignette --- .github/workflows/pkgdown.yaml | 4 +- vignettes/BuildingPredictiveModels.Rmd | 214 ++++++++++++------------- 2 files changed, 106 insertions(+), 112 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 446629a66..1aaf11b10 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -18,7 +18,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -41,7 +41,7 @@ jobs: - name: Deploy to GitHub pages 🚀 if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@4.1.4 + uses: JamesIves/github-pages-deploy-action@v4 with: clean: false branch: gh-pages diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index 8b210ee44..4ed97cb75 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -24,12 +24,11 @@ output: toc: yes --- -```{=html} -``` + ```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} library(PatientLevelPrediction) vignetteDataFolder <- "s:/temp/plpVignette" @@ -68,13 +67,13 @@ This vignette describes how you can use the `PatientLevelPrediction` package to We have to clearly specify our study upfront to be able to implement it. This means we need to define the prediction problem we like to address, in which population we will build the model, which model we will build and how we will evaluate its performance. To guide you through this process we will use a "Disease onset and progression" prediction type as an example. -## Problem definition 1: Stroke in afibrilation patients +## Problem definition 1: Stroke in atrial fibrilation patients Atrial fibrillation is a disease characterized by an irregular heart rate that can cause poor blood flow. Patients with atrial fibrillation are at increased risk of ischemic stroke. Anticoagulation is a recommended prophylaxis treatment strategy for patients at high risk of stroke, though the underuse of anticoagulants and persistent severity of ischemic stroke represents a substantial unmet medical need. Various strategies have been developed to predict risk of ischemic stroke in patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed as a risk score based on history of congestive heart failure, hypertension, age\>=75, diabetes and stroke. CHADS2 was initially derived using Medicare claims data, where it achieved good discrimination (AUC=0.82). However, subsequent external validation studies revealed the CHADS2 had substantially lower predictive accuracy (Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have been developed and evaluated, including the extension of CHADS2Vasc. 
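To make the scoring concrete, below is a minimal illustrative tally of the CHADS2 score (1 point each for congestive heart failure, hypertension, age >= 75 and diabetes, and 2 points for prior stroke or TIA). This sketch is background only and is not part of the PatientLevelPrediction package.

```r
# Illustrative CHADS2 tally (background sketch, not part of PatientLevelPrediction)
chads2 <- function(chf, hypertension, age, diabetes, priorStroke) {
  # chf, hypertension, diabetes and priorStroke are 0/1 indicators
  chf + hypertension + (age >= 75) + diabetes + 2 * priorStroke
}

# A 76-year-old with hypertension and a prior stroke scores 4:
chads2(chf = 0, hypertension = 1, age = 76, diabetes = 0, priorStroke = 1)
```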
The management of atrial fibrillation has evolved substantially over the last decade, for various reasons that include the introduction of novel oral anticoagulants. With these innovations has come a renewed interest in greater precision medicine for stroke prevention.

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

Amongst patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year?

We will define 'patients who are newly diagnosed with Atrial Fibrillation' as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'Ischemic stroke events' as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes.

## Problem definition 2: Angioedema in ACE inhibitor new users

Angiotensin converting enzyme inhibitors (ACE inhibitors) are medications used by patients with hypertension that widen the blood vessels, thereby increasing the amount of blood pumped by the heart and decreasing blood pressure.

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

Amongst patients who are newly dispensed an ACE inhibitor, which patients will go on to have angioedema within 1 year?

We will define 'patients who are newly dispensed an ACE inhibitor' as the first drug record of any ACE inhibitor, [...]which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'angioedema' as an angioedema condition record.

## Study population definition

The final study population in which we will develop our model is often a subset of the Target population, because we may apply criteria that are dependent on T and O or we want to do sensitivity analyses with subpopulations of T. For this we have to answer the following questions:

- *How do we define the period in which we will predict our outcome relative to the target cohort start?* We actually have to make two decisions to answer that question. First, does the time-at-risk window start at the date of the start of the target cohort or later? Arguments to make it start later could be that you want to avoid outcomes that were entered late in the record that actually occurred before the start of the target cohort or you want to leave a gap where interventions to prevent the outcome could theoretically be implemented. Second, you need to define the time-at-risk by setting the risk window end, as some specification of days offset relative to the target cohort start or end dates. For our problem we will predict in a 'time-at-risk' window starting 1 day after the start of the target cohort up to 365 days later (to look for 1-year risk following atrial fibrillation diagnosis).

- *Do we require a minimum amount of time-at-risk?* We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with 'Yes, require a minimum time-at-risk' for that reason.
Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete. +- *Do we require a minimum amount of time-at-risk?* We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with ‘Yes, require a minimum time-at-risk’ for that reason. Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete. ## Model development settings @@ -134,35 +133,35 @@ Finally, we have to define how we will train and test our model on our data, i.e We now completely defined our studies and implement them: -- [See example 1: Stroke in afibrilation patients](#example1) -- [See example 2: Agioedema in ACE inhibitor new users](#example2) +- [See example 1: Stroke in Atrial fibrillation patients](#example1) +- [See example 2: Angioedema in ACE inhibitor new users](#example2) -# Example 1: Stroke in afibrilation patients {#example1} +# Example 1: Stroke in Atrial fibrillation patients {#example1} ## Study Specification For our first prediction model we decide to start with a Regularized Logistic Regression and will use the default parameters. We will do a 75%-25% split by person. -| Definition | Value | -|-----------------|-------------------------------------------------------| -| **Problem Definition** | | -| Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. | -| Outcome Cohort (O) | 'Ischemic stroke events' defined as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes. | -| Time-at-risk (TAR) | 1 day till 365 days from cohort start | -| | | -| **Population Definition** | | -| Washout Period | 1095 | -| Enter the target cohort multiple times? | No | -| Allow prior outcomes? | Yes | -| Start of time-at-risk | 1 day | -| End of time-at-risk | 365 days | -| Require a minimum amount of time-at-risk? | Yes (364 days) | -| | | -| **Model Development** | | -| Algorithm | Regularized Logistic Regression | -| Hyper-parameters | variance = 0.01 (Default) | -| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | -| Data split | 75% train, 25% test. 
Randomly assigned by person | +| Definition | Value | +|------------------------------------|------------------------------------| +| **Problem Definition** | | +| Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. | +| Outcome Cohort (O) | 'Ischemic stroke events' defined as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes. | +| Time-at-risk (TAR) | 1 day till 365 days from cohort start | +| | | +| **Population Definition** | | +| Washout Period | 1095 | +| Enter the target cohort multiple times? | No | +| Allow prior outcomes? | Yes | +| Start of time-at-risk | 1 day | +| End of time-at-risk | 365 days | +| Require a minimum amount of time-at-risk? | Yes (364 days) | +| | | +| **Model Development** | | +| Algorithm | Regularized Logistic Regression | +| Hyper-parameters | variance = 0.01 (Default) | +| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | +| Data split | 75% train, 25% test. Randomly assigned by person | According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later. @@ -197,8 +196,8 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and The T and O cohorts can be found here: -- Atrial Fibrillaton (T): -- Stroke (O) : + + In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). 
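As an aside, cohorts defined in ATLAS can also be retrieved into R programmatically. The sketch below assumes an ATLAS WebAPI endpoint and uses the ROhdsiWebApi package; the URL and cohort id are placeholders, and the exact function signatures should be verified against that package's documentation.

```r
# Sketch: fetch an ATLAS cohort definition and its SQL via ROhdsiWebApi
library(ROhdsiWebApi)

baseUrl <- "https://your-atlas-server/WebAPI" # placeholder WebAPI endpoint
cohortId <- 1234                              # placeholder ATLAS cohort id

# Retrieve the cohort definition and render it to SQL for the CDM
cohortDefinition <- getCohortDefinition(cohortId = cohortId, baseUrl = baseUrl)
sql <- getCohortSql(cohortDefinition, baseUrl = baseUrl, generateStats = FALSE)
```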
@@ -218,10 +217,10 @@ File AfStrokeCohorts.sql Create a table to store the persons in the T and C cohort */ -IF OBJECT_ID('@resultsDatabaseSchema.PLPAFibStrokeCohort', 'U') IS NOT NULL -DROP TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort; +IF OBJECT_ID('@cohortsDatabaseSchema.AFibStrokeCohort', 'U') IS NOT NULL +DROP TABLE @cohortsDatabaseSchema.AFibStrokeCohort; -CREATE TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort +CREATE TABLE @cohortsDatabaseSchema.AFibStrokeCohort ( cohort_definition_id INT, subject_id BIGINT, @@ -238,7 +237,7 @@ any descendants, indexed at the first diagnosis - who have >1095 days of prior observation before their first diagnosis - and have no warfarin exposure any time prior to first AFib diagnosis */ -INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, +INSERT INTO @cohortsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, subject_id, cohort_start_date, cohort_end_date) @@ -280,7 +279,7 @@ FROM 'cerebral infarction' and descendants, 'cerebral thrombosis', 'cerebral embolism', 'cerebral artery occlusion' */ - INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, + INSERT INTO @cohortsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, subject_id, cohort_start_date, cohort_end_date) @@ -312,30 +311,29 @@ FROM This is parameterized SQL which can be used by the [`SqlRender`](http://github.com/OHDSI/SqlRender) package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in `SqlRender`, we can make sure the SQL code can be run in many different environments. -To execute this sql against our CDM we first need to tell R how to connect to the server. `PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails`. Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code: +To execute this `sql` against our CDM we first need to tell R how to connect to the server. `PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails()`. Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code: ```{r tidy=FALSE,eval=FALSE} + library(DatabaseConnector) connectionDetails <- createConnectionDetails(dbms = "postgresql", server = "localhost/ohdsi", user = "joe", password = "supersecret") - cdmDatabaseSchema <- "my_cdm_data" - cohortsDatabaseSchema <- "my_results" + cdmDatabaseSchema <- "cdm" + cohortsDatabaseSchema <- "cohorts" cdmVersion <- "5" ``` -The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. 
+The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, you need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. ```{r tidy=FALSE,eval=FALSE} library(SqlRender) sql <- readSql("AfStrokeCohorts.sql") - sql <- renderSql(sql, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortsDatabaseSchema = cohortsDatabaseSchema, - post_time = 30, - pre_time = 365)$sql - sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql + sql <- render(sql, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortsDatabaseSchema = cohortsDatabaseSchema) + sql <- translate(sql, targetDialect = connectionDetails$dbms) connection <- connect(connectionDetails) executeSql(connection, sql) @@ -349,8 +347,8 @@ If all went well, we now have a table with the events of interest. We can see ho sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", "FROM @cohortsDatabaseSchema.AFibStrokeCohort", "GROUP BY cohort_definition_id") - sql <- renderSql(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)$sql - sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql + sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) + sql <- translate(sql, targetDialect = connectionDetails$dbms) querySql(connection, sql) ``` @@ -365,32 +363,33 @@ In this section we assume that our cohorts have been created either by using ATL ### Data extraction -Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: +Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtraction`](https://github.com/OHDSI/FeatureExtraction) package. In short the `FeatureExtraction` package allows you to specify which features (`covariates`) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom `covariates`. For more detailed information on the `FeatureExtraction` package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). 
For our example study we decided to use these settings: ```{r tidy=FALSE,eval=FALSE} + library(FeatureExtraction) covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, - useDemographicsAge = TRUE, - useConditionGroupEraLongTerm = TRUE, - useConditionGroupEraAnyTimePrior = TRUE, - useDrugGroupEraLongTerm = TRUE, - useDrugGroupEraAnyTimePrior = TRUE, - useVisitConceptCountLongTerm = TRUE, - longTermStartDays = -365, - endDays = -1) + useDemographicsAge = TRUE, + useConditionGroupEraLongTerm = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraLongTerm = TRUE, + useDrugGroupEraAnyTimePrior = TRUE, + useVisitConceptCountLongTerm = TRUE, + longTermStartDays = -365, + endDays = -1) ``` -The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. +The final step for extracting the data is to run the `getPlpData()` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the `washoutPeriod` which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. ```{r tidy=FALSE,eval=FALSE} - +library(PatientLevelPrediction) databaseDetails <- createDatabaseDetails( connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, cdmDatabaseName = '', - cohortDatabaseSchema = resultsDatabaseSchema, + cohortDatabaseSchema = cohortsDatabaseSchema, cohortTable = 'AFibStrokeCohort', - cohortId = 1, - outcomeDatabaseSchema = resultsDatabaseSchema, + targetId = 1, + outcomeDatabaseSchema = cohortsDatabaseSchema, outcomeTable = 'AFibStrokeCohort', outcomeIds = 2, cdmVersion = 5 @@ -401,14 +400,13 @@ databaseDetails <- createDatabaseDetails( # or restricting to first index date (if people can be in target cohort multiple times) restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000) - plpData <- getPlpData( - databaseDetails = databaseDetails, - covariateSettings = covariateSettings, - restrictPlpDataSettings = restrictPlpDataSettings - ) +plpData <- getPlpData(databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = restrictPlpDataSettings + ) ``` -Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `createRestrictPlpDataSettings` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `Andromeda` (which uses [SQLite](https://www.sqlite.org/index.html)) to store information in a way that ensures R does not run out of memory, even when the data are large. +Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `getPlpData()` function which are all documented in the `PatientLevelPrediction` manual. 
The resulting `plpData` object uses the package `Andromeda` (which uses [SQLite](https://www.sqlite.org/index.html)) to store information in a way that ensures R does not run out of memory, even when the data are large. Creating the `plpData` object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because `plpData` uses `Andromeda`, we cannot use R's regular save function. Instead, we'll have to use the `savePlpData()` function: @@ -420,9 +418,7 @@ We can use the `loadPlpData()` function to load the data in a future session. ### Additional inclusion criteria -To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minumim time at risk can be enforced (`requireTimeAtRisk, minTimeAtRisk`) and we can specify if this also applies to patients with the outcome (`includeAllOutcomes`). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set `riskWindowStart = 30` and `riskWindowEnd = 365`. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting `addExposureToStart = TRUE` which adds the cohort (exposure) time to the start date. - -In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population. +To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minimum time at risk can be enforced (`requireTimeAtRisk`, `minTimeAtRisk`) and we can specify if this also applies to patients with the outcome (`includeAllOutcomes`). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set `riskWindowStart = 30` and `riskWindowEnd = 365`. In the example below all the settings we defined for our study are imposed: @@ -446,9 +442,9 @@ In the example below all the settings we defined for our study are imposed: When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). -In small data the best approach for internal validation has been shown to be boostrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add%20link). +In small data the best approach for internal validation has been shown to be bootstrapping. 
However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is adequate to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](https://bmjopen.bmj.com/content/11/12/e050146.abstract). -In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: +In the `PatientLevelPrediction` package, the `splitSettings` define how the `plpData` are partitioned into training/validation/testing data. They are created with `createDefaultSplitSetting()`. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the `splitSettings` we can use the type (`stratified`/`time`/`subject`) and `testFraction` parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: ```{r tidy=FALSE,eval=FALSE} splitSettings <- createDefaultSplitSetting( @@ -460,7 +456,7 @@ In the PatientLevelPrediction package, the splitSettings define how the plpData ) ``` -Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf). +Note: it is possible to add a custom method to specify how the `plpData` are partitioned into training/validation/testing data, see `R vignette('AddingCustomSplitting')` ### Preprocessing the training data @@ -561,26 +557,26 @@ To load the full results structure use: ## Study Specification -| Definition | Value | -|----------------------|--------------------------------------------------| -| **Problem Definition** | | -| Target Cohort (T) | 'Patients who are newly dispensed an ACE inhibitor' defined as the first drug record of any ACE inhibitor | -| Outcome Cohort (O) | 'Angioedema' defined as an angioedema condition record during an inpatient or ER visit | -| Time-at-risk (TAR) | 1 day till 365 days from cohort start | -| | | -| **Population Definition** | | -| Washout Period | 365 | -| Enter the target cohort multiple times? | No | -| Allow prior outcomes? | No | -| Start of time-at-risk | 1 day | -| End of time-at-risk | 365 days | -| Require a minimum amount of time-at-risk? | Yes (364 days) | -| | | -| **Model Development** | | -| Algorithm | Gradient Boosting Machine | -| Hyper-parameters | ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9 | -| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | -| Data split | 75% train, 25% test. 
Randomly assigned by person | +| Definition | Value | +|------------------------------------|------------------------------------| +| **Problem Definition** | | +| Target Cohort (T) | 'Patients who are newly dispensed an ACE inhibitor' defined as the first drug record of any ACE inhibitor | +| Outcome Cohort (O) | 'Angioedema' defined as an angioedema condition record during an inpatient or ER visit | +| Time-at-risk (TAR) | 1 day till 365 days from cohort start | +| | | +| **Population Definition** | | +| Washout Period | 365 | +| Enter the target cohort multiple times? | No | +| Allow prior outcomes? | No | +| Start of time-at-risk | 1 day | +| End of time-at-risk | 365 days | +| Require a minimum amount of time-at-risk? | Yes (364 days) | +| | | +| **Model Development** | | +| Algorithm | Gradient Boosting Machine | +| Hyper-parameters | ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9 | +| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | +| Data split | 75% train, 25% test. Randomly assigned by person | According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later. @@ -615,8 +611,8 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and The T and O cohorts can be found here: -- Ace inhibitors (T): -- Angioedema (O) : + + In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). @@ -988,8 +984,8 @@ The script we created manually above can also be automatically created using a p
![](atlasplp4.webp)
-]
-\newpage
+
+
 ]

\newpage

ATLAS can build an R package for you that will execute the full study against your CDM. The steps to do this in ATLAS are explained below.

@@ -1229,14 +1225,14 @@ This will extract the new plpData from the specified schemas and cohort tables.

The package has much more functionality than described in this vignette and contributions have been made by many persons in the OHDSI community. The table below provides an overview:

-| Functionality | Description | Vignette |
-|-----------------|--------------------------------------|-----------------|
-| Builing Multiple Models | This vignette describes how you can run multiple models automatically | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) |
-| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) |
-| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) |
-| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) |
-| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) |
-| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) |
+| Functionality | Description | Vignette |
+|------------------------|------------------------|------------------------|
+| Building Multiple Models | This vignette describes how you can run multiple models automatically | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) |
+| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) |
+| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) |
+| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) |
+| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) |
+| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) |

# Demos

In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,

-```{=tex}
\newpage
3
-```
+
)
@@ -1316,10 +1311,9 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
-```{=tex} \newpage 5 -``` + )
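To make the effect of these population settings concrete, a minimal sketch contrasting a strict and a lenient configuration is shown below; the argument names follow `createStudyPopulationSettings()` and the specific values are purely illustrative:

```{r tidy=FALSE,eval=FALSE}
# illustrative sketch: the strict settings exclude people with a prior outcome
# and require (nearly) complete time-at-risk, the lenient settings keep them
strictPopulationSettings <- createStudyPopulationSettings(
  removeSubjectsWithPriorOutcome = TRUE,
  requireTimeAtRisk = TRUE,
  minTimeAtRisk = 364,
  includeAllOutcomes = TRUE,
  riskWindowStart = 1,
  riskWindowEnd = 365
)

lenientPopulationSettings <- createStudyPopulationSettings(
  removeSubjectsWithPriorOutcome = FALSE,
  requireTimeAtRisk = FALSE,
  includeAllOutcomes = FALSE,
  riskWindowStart = 1,
  riskWindowEnd = 365
)
```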
From 312739d2a7f0527cbbcfda42f958584bb5b4c9d4 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 13:13:12 +0100 Subject: [PATCH 10/15] remove ohdsi cohort links --- vignettes/BuildingPredictiveModels.Rmd | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index 4ed97cb75..df0e9383c 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -196,8 +196,6 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and The T and O cohorts can be found here: - - In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). @@ -611,8 +609,6 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and The T and O cohorts can be found here: - - In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). From 4f8a9c5f8903b1b7696577773005ca07772b0396 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 13:45:25 +0100 Subject: [PATCH 11/15] add build site again... --- .github/workflows/pkgdown.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 1aaf11b10..50f32fa91 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -35,6 +35,9 @@ jobs: - uses: lycheeverse/lychee-action@v2 with: args: --base . --verbose --no-progress --accept '100..=103, 200..=299, 403' './**/*.md' './**/*.Rmd' + + - name: Build site + run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' - name: Fix Hades Logo run: Rscript -e 'OhdsiRTools::fixHadesLogo()' From 0adaeb39a701e2d490e748829e5bbb786f9b9811 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 28 Oct 2024 16:20:53 +0100 Subject: [PATCH 12/15] try vignette links --- vignettes/BuildingPredictiveModels.Rmd | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index df0e9383c..d1c422e56 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -118,8 +118,8 @@ tabl <- " | Naive Bayes | The Naive Bayes algorithm applies the Bayes theorem with the 'naive' assumption of conditional independence between every pair of features given the value of the class variable. Based on the likelihood the data belongs to a class and the prior distribution of the class, a posterior distribution is obtained. | none | | AdaBoost | AdaBoost is a boosting ensemble technique. Boosting works by iteratively adding classifiers but adds more weight to the data-points that are misclassified by prior classifiers in the cost function when training the next classifier. We use the sklearn 'AdaboostClassifier' implementation in Python. | nEstimators (the maximum number of estimators at which boosting is terminated), learningRate (learning rate shrinks the contribution of each classifier by learning_rate. There is a trade-off between learningRate and nEstimators) | | Decision Tree | A decision tree is a classifier that partitions the variable space using individual tests selected using a greedy approach. 
It aims to find partitions that have the highest information gain to separate the classes. The decision tree can easily overfit by enabling a large number of partitions (tree depth) and often needs some regularization (e.g., pruning or specifying hyper-parameters that limit the complexity of the model). We use the sklearn 'DecisionTreeClassifier' implementation in Python. | maxDepth (the maximum depth of the tree), minSamplesSplit,minSamplesLeaf, minImpuritySplit (threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.), seed,classWeight ('Balance' or 'None') | -| Multilayer Perception | Neural networks contain multiple layers that weight their inputs using a non-linear function. The first layer is the input layer, the last layer is the output layer the between are the hidden layers. Neural networks are generally trained using feed forward back-propagation. This is when you go through the network with a data-point and calculate the error between the true label and predicted label, then go backwards through the network and update the linear function weights based on the error. This can also be performed as a batch, where multiple data-points are fee| size (the number of hidden nodes), alpha (the l2 regularisation), seed | -| Deep Learning (now in seperate DeepPatientLevelPrediction R package) | Deep learning such as deep nets, convolutional neural networks or recurrent neural networks are similar to a neural network but have multiple hidden layers that aim to learn latent representations useful for prediction. In the seperate BuildingDeepLearningModels vignette we describe these models and hyper-parameters in more detail | see OHDSI/DeepPatientLevelPrediction| +| Multilayer Perception | Neural networks contain multiple layers that weight their inputs using a non-linear function. The first layer is the input layer, the last layer is the output layer the between are the hidden layers. Neural networks are generally trained using feed forward back-propagation. This is when you go through the network with a data-point and calculate the error between the true label and predicted label, then go backwards through the network and update the linear function weights based on the error. | size (the number of hidden nodes), alpha (the l2 regularisation), seed | +| Deep Learning (now in seperate DeepPatientLevelPrediction R package) | Deep learning such as deep nets, convolutional neural networks or recurrent neural networks are similar to a neural network but have multiple hidden layers that aim to learn latent representations useful for prediction. | see: https://github.com/OHDSI/DeepPatientLevelPrediction| " cat(tabl) # output the table in a format good for HTML/PDF/docx conversion ``` @@ -145,7 +145,7 @@ For our first prediction model we decide to start with a Regularized Logistic Re | Definition | Value | |------------------------------------|------------------------------------| | **Problem Definition** | | -| Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. 
| +| Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmia, or a procedure to treat arrhythmia. | | Outcome Cohort (O) | 'Ischemic stroke events' defined as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes. | | Time-at-risk (TAR) | 1 day till 365 days from cohort start | | | | @@ -188,24 +188,23 @@ Both methods are described below for our example prediction problem. ### ATLAS cohort builder -![Target Cohort Atrial Fibrillation](example1/ATLAS_T.webp) +![Figure 4: Target Cohort Atrial Fibrillation](example1/ATLAS_T.webp) ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS. -![Outcome Cohort Stroke](example1/ATLAS_O.webp) +![Figure 5: Outcome Cohort Stroke](example1/ATLAS_O.webp) The T and O cohorts can be found here: - In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). -Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4. +Note that when a cohort is created in ATLAS the `cohortId` is needed to extract the data in R. The `cohortId` can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4. ### Custom cohorts It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed. -For our example study, we need to create at table to hold the cohort data and we need to create SQL code to instantiate this table for both the AF and Stroke cohorts. Therefore, we create a file called *AfStrokeCohorts.sql* with the following contents: +For our example study, we need to create at table to hold the cohort data and we need to create SQL code to instantiate this table for both the AF and Stroke cohorts. Therefore, we create a file called *`AfStrokeCohorts.sql`* with the following contents: ```{sql, eval=FALSE} /*********************************** @@ -376,7 +375,7 @@ Now we can tell `PatientLevelPrediction` to extract all necessary data for our a endDays = -1) ``` -The final step for extracting the data is to run the `getPlpData()` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the `washoutPeriod` which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. 
+The final step for extracting the data is to run the `getPlpData()` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the `washoutPeriod` which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed `covariateSettings`. ```{r tidy=FALSE,eval=FALSE} library(PatientLevelPrediction) @@ -436,7 +435,7 @@ In the example below all the settings we defined for our study are imposed: ) ``` -### Spliting the data into training/validation/testing datasets +### Splitting the data into training/validation/testing datasets When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). @@ -454,7 +453,7 @@ In the `PatientLevelPrediction` package, the `splitSettings` define how the `plp ) ``` -Note: it is possible to add a custom method to specify how the `plpData` are partitioned into training/validation/testing data, see `R vignette('AddingCustomSplitting')` +Note: it is possible to add a custom method to specify how the `plpData` are partitioned into training/validation/testing data, see `vignette('AddingCustomSplitting')` ### Preprocessing the training data @@ -609,7 +608,6 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and The T and O cohorts can be found here: - In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1770617 in Figure 6. From c8335d08a9a43b9b043a00b4512f5dc811af46fd Mon Sep 17 00:00:00 2001 From: egillax Date: Fri, 1 Nov 2024 10:06:09 +0100 Subject: [PATCH 13/15] spelling fix in vignette --- vignettes/BuildingPredictiveModels.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index d1c422e56..c744a99c5 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -73,7 +73,7 @@ Atrial fibrillation is a disease characterized by an irregular heart rate that c We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question: -Amongt patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year? +Among patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year? 
We will define 'patients who are newly diagnosed with Atrial Fibrillation' as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'Ischemic stroke events' as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes.

From 0eb5da8cee8ee93b49714861d3b6162869f0c133 Mon Sep 17 00:00:00 2001
From: egillax
Date: Fri, 1 Nov 2024 11:20:15 +0100
Subject: [PATCH 14/15] fix broken cohort links to point to new urls for the atlas demo

---
 vignettes/BuildingPredictiveModels.Rmd | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd
index c744a99c5..b82e1e45c 100644
--- a/vignettes/BuildingPredictiveModels.Rmd
+++ b/vignettes/BuildingPredictiveModels.Rmd
@@ -196,6 +196,9 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and

 The T and O cohorts can be found here:

+- Atrial Fibrillation (T):
+- Stroke (O):
+
@@ -608,6 +611,9 @@ ATLAS allows you to define cohorts interactively by specifying cohort entry and

 The T and O cohorts can be found here:

+- ACE inhibitors (T):
+- Angioedema (O):
+

From 2d8f2259ceaff0a31b06d412232adc5e63d5d867 Mon Sep 17 00:00:00 2001
From: egillax
Date: Fri, 1 Nov 2024 13:55:46 +0100
Subject: [PATCH 15/15] fix linting and add news

---
 .github/workflows/pkgdown.yaml                 |   2 +-
 NEWS.md                                        |   4 +
 README.md                                      |  16 +-
 vignettes/AddingCustomFeatureEngineering.Rmd   |  60 ++-
 vignettes/AddingCustomModels.Rmd               | 323 ++++++-------
 vignettes/AddingCustomSamples.Rmd              |  77 ++-
 vignettes/AddingCustomSplitting.Rmd            |  20 +-
 vignettes/BuildingMultiplePredictiveModels.Rmd | 119 +++--
 vignettes/BuildingPredictiveModels.Rmd         | 448 +++++++++---------
 vignettes/ConstrainedPredictors.Rmd            |   2 +-
 vignettes/CreatingLearningCurves.Rmd           |  57 +--
 vignettes/CreatingNetworkStudies.Rmd           |   8 +-
 vignettes/InstallationGuide.Rmd                |   7 +-
 13 files changed, 557 insertions(+), 586 deletions(-)

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index 50f32fa91..83fa4efc1 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -2,7 +2,7 @@ # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, develop, broken_links] + branches: [main, develop] release: types: [published] workflow_dispatch: diff --git a/NEWS.md b/NEWS.md index 816369a51..627b8acca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ PatientLevelPrediction develop ====================== +- Fixed linting errors and R codestyle in docs to conform to HADES style +- Remove links to pdf's, point to website instead. +- Fix broken links in Readme and BuildingPredictiveModels vignette +- Added an action to detect broken links in repo - Official maintainer updated to Egill Fridgeirsson diff --git a/README.md b/README.md index f70f8b13c..9587cfa6f 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ install Python 3.9 or higher using Anaconda (https://www.continuum.io/downloads) Getting Started =============== -- To install the package please read the [Package Installation guide](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf) +- To install the package please read the [Package Installation guide](https://ohdsi.github.io/PatientLevelPrediction/articles/InstallationGuide.html) - Have a look at the video below for an extensive demo of the package. @@ -93,24 +93,22 @@ alt="Video Vignette PLP Package" width="240" height="180" border="10" /> Please read the main vignette for the package: -- [Building Single Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf) +- [Building Single Patient-Level Predictive Models](https://ohdsi.github.io/PatientLevelPrediction/articles/BuildingPredictiveModels.html) In addition we have created vignettes that describe advanced functionality in more detail: -- [Building Multiple Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) -- [Adding Custom Machine Learning Algorithms](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) +- [Building Multiple Patient-Level Predictive Models](https://ohdsi.github.io/PatientLevelPrediction/articles/BuildingMultiplePredictiveModels.html) +- [Adding Custom Machine Learning Algorithms](https://ohdsi.github.io/PatientLevelPrediction/articles/AddingCustomModels.html) - [Building Deep Learning Models](https://github.com/OHDSI/DeepPatientLevelPrediction) - [Building Ensemble Models](https://github.com/OHDSI/EnsemblePatientLevelPrediction) -- [Creating Learning Curves](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) +- [Creating Learning Curves](https://ohdsi.github.io/PatientLevelPrediction/articles/CreatingLearningCurves.html) -Package manual: [PatientLevelPrediction.pdf](https://github.com/OHDSI/PatientLevelPrediction/blob/main/extras/PatientLevelPrediction.pdf) +Package function reference: [Reference](https://ohdsi.github.io/PatientLevelPrediction/reference/index.html) User Documentation ================== Documentation can be found on the [package website](https://ohdsi.github.io/PatientLevelPrediction). -PDF versions of the documentation are also available, as mentioned above. - Support ======= * Developer questions/comments/feedback: OHDSI Forum @@ -132,5 +130,5 @@ PatientLevelPrediction is being developed in R Studio. 
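For quick reference, the usual install route for OHDSI packages is via GitHub; a minimal sketch follows (the Installation guide linked above remains the authoritative source for platform-specific steps):

```r
# install the remotes helper, then the package from the OHDSI repository
install.packages("remotes")
remotes::install_github("OHDSI/PatientLevelPrediction")
```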
# Acknowledgements - The package is maintained by Egill Fridgeirsson and Jenna Reps and has been developed with major contributions from Peter Rijnbeek, Martijn Schuemie, Patrick Ryan, and Marc Suchard. -- We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins, Alex Rekkas +- We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins, Alexandros Rekkas - This project is supported in part through the National Science Foundation grant IIS 1251151. diff --git a/vignettes/AddingCustomFeatureEngineering.Rmd b/vignettes/AddingCustomFeatureEngineering.Rmd index fab612c7f..0ec7c921f 100644 --- a/vignettes/AddingCustomFeatureEngineering.Rmd +++ b/vignettes/AddingCustomFeatureEngineering.Rmd @@ -32,7 +32,7 @@ library(PatientLevelPrediction) # Introduction -This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the `vignette('BuildingPredictiveModels')`. **We invite you to share your new feature engineering functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** @@ -65,22 +65,18 @@ Let's consider the situation where we wish to create an age spline feature. To m Our age spline feature function will create a new feature using the `plpData$cohorts$ageYear` column. We will implement a restricted cubic spline that requires specifying the number of knots. Therefore, the inputs for this are: `knots` - an integer/double specifying the number of knots. ```{r, echo = TRUE, eval=FALSE} -createAgeSpline <- function( - knots = 5 - ){ - +createAgeSpline <- function(knots = 5) { # create list of inputs to implement function featureEngineeringSettings <- list( knots = knots - ) - + ) + # specify the function that will implement the sampling attr(featureEngineeringSettings, "fun") <- "implementAgeSplines" # make sure the object returned is of class "sampleSettings" class(featureEngineeringSettings) <- "featureEngineeringSettings" return(featureEngineeringSettings) - } ``` @@ -93,7 +89,7 @@ All 'implement' functions must take as input the `trainData` and the `featureEng In our example, the `createAgeSpline()` will return a list with 'knots'. The `featureEngineeringSettings` therefore contains this. 
```{r tidy=FALSE,eval=FALSE} -implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NULL) { +implementAgeSplines <- function(trainData, featureEngineeringSettings, model = NULL) { # if there is a model, it means this function is called through applyFeatureengineering, meaning it # should apply the model fitten on training data to the test data if (is.null(model)) { knots <- featureEngineeringSettings$knots @@ -101,19 +97,18 @@ implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NUL y <- ageData$outcomeCount X <- ageData$ageYear model <- mgcv::gam( - y ~ s(X, bs='cr', k=knots, m=2) + y ~ s(X, bs = "cr", k = knots, m = 2) ) newData <- data.frame( rowId = ageData$rowId, covariateId = 2002, covariateValue = model$fitted.values ) - } - else { + } else { ageData <- trainData$labels X <- trainData$labels$ageYear y <- ageData$outcomeCount - newData <- data.frame(y=y, X=X) + newData <- data.frame(y = y, X = X) yHat <- predict(model, newData) newData <- data.frame( rowId = trainData$labels$rowId, @@ -121,37 +116,40 @@ implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NUL covariateValue = yHat ) } - - # remove existing age if in covariates - trainData$covariateData$covariates <- trainData$covariateData$covariates |> - dplyr::filter(!covariateId %in% c(1002)) - + + # remove existing age if in covariates + trainData$covariateData$covariates <- trainData$covariateData$covariates |> + dplyr::filter(!.data$covariateId %in% c(1002)) + # update covRef - Andromeda::appendToTable(trainData$covariateData$covariateRef, - data.frame(covariateId=2002, - covariateName='Cubic restricted age splines', - analysisId=2, - conceptId=2002)) - + Andromeda::appendToTable( + trainData$covariateData$covariateRef, + data.frame( + covariateId = 2002, + covariateName = "Cubic restricted age splines", + analysisId = 2, + conceptId = 2002 + ) + ) + # update covariates Andromeda::appendToTable(trainData$covariateData$covariates, newData) - + featureEngineering <- list( - funct = 'implementAgeSplines', + funct = "implementAgeSplines", settings = list( featureEngineeringSettings = featureEngineeringSettings, model = model ) ) - - attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend( - attr(trainData$covariateData, 'metaData')$featureEngineering, + + attr(trainData$covariateData, "metaData")$featureEngineering <- listAppend( + attr(trainData$covariateData, "metaData")$featureEngineering, featureEngineering ) - + return(trainData) } - ``` # Acknowledgments diff --git a/vignettes/AddingCustomModels.Rmd b/vignettes/AddingCustomModels.Rmd index 84b21e7b4..c7d187625 100644 --- a/vignettes/AddingCustomModels.Rmd +++ b/vignettes/AddingCustomModels.Rmd @@ -32,7 +32,7 @@ library(PatientLevelPrediction) # Introduction -This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). 
+This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the `vignette('BuildingPredictiveModels')`. **We invite you to share your new algorithms with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** @@ -50,42 +50,41 @@ The set\ is a function that takes as input the different hyper-parameter The param object can have a setttings attribute containing any extra settings. For example to specify the model name and the seed used for reproducibility: ```{r, echo = TRUE, eval=FALSE} -attr(param, 'settings') <- list( +attr(param, "settings") <- list( seed = 12, modelName = "Special classifier" - ) +) ``` For example, if you were adding a model called madeUp that has two hyper-parameters then the set function should be: ```{r tidy=FALSE,eval=FALSE} -setMadeUp <- function(a=c(1,4,10), b=2, seed=NULL){ +setMadeUp <- function(a = c(1, 4, 10), b = 2, seed = NULL) { # add input checks here... - + param <- split( expand.grid( - a=a, - b=b + a = a, + b = b ), - 1:(length(a)*length(b)) - ) - - attr(param, 'settings') <- list( + 1:(length(a) * length(b)) + ) + + attr(param, "settings") <- list( modelName = "Made Up", requiresDenseMatrix = TRUE, seed = seed - ) - + ) + # now create list of all combinations: result <- list( - fitFunction = 'fitMadeUp', # this will be called to train the made up model + fitFunction = "fitMadeUp", # this will be called to train the made up model param = param ) - class(result) <- 'modelSettings' - + class(result) <- "modelSettings" + return(result) } - ``` ## Fit @@ -140,36 +139,35 @@ Note: If a new modelType is desired, then the evalaution code within PatientLeve A full example of a custom 'binary' classifier fit function is: ```{r tidy=FALSE,eval=FALSE} -fitMadeUp <- function(trainData, modelSettings, search, analysisId){ - +fitMadeUp <- function(trainData, modelSettings, search, analysisId) { param <- modelSettings$param - + # **************** code to train the model here - # trainedModel <- this code should apply each hyper-parameter combination + # trainedModel <- this code should apply each hyper-parameter combination # (param[[i]]) using the specified search (e.g., cross validation) # then pick out the best hyper-parameter setting - # and finally fit a model on the whole train data using the + # and finally fit a model on the whole train data using the # optimal hyper-parameter settings # **************** - + # **************** code to apply the model to trainData # prediction <- code to apply trainedModel to trainData # **************** - + # **************** code to get variable importance (if possible) # varImp <- code to get importance of each variable in trainedModel # **************** - - + + # construct the standard output for a model: - result <- list(model = trainedModel, - prediction = prediction, # the train and maybe the cross validation predictions for the trainData - preprocessing = list( - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = attr(param, 
'settings')$requiresDenseMatrix, - - ), + result <- list( + model = trainedModel, + prediction = prediction, # the train and maybe the cross validation predictions for the trainData + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix, + ), modelDesign = list( outcomeId = attr(trainData, "metaData")$outcomeId, targetId = attr(trainData, "metaData")$targetId, @@ -177,37 +175,34 @@ fitMadeUp <- function(trainData, modelSettings, search, analysisId){ covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, - prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, + prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, modelSettings = list( - model = attr(param, 'settings')$modelName, # the model name + model = attr(param, "settings")$modelName, # the model name param = param, finalModelParameters = param[[bestInd]], # best hyper-parameters - extraSettings = attr(param, 'settings') + extraSettings = attr(param, "settings") ), splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings ), - trainDetails = list( analysisId = analysisId, developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, - attrition = attr(trainData, "metaData")$attrition, + attrition = attr(trainData, "metaData")$attrition, trainingTime = timeToTrain, # how long it took to train the model trainingDate = Sys.Date(), hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame ), - covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible + covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by = "covariateId") # add variable importance to covariateRef if possible ) - class(result) <- 'plpModel' - attr(result, 'predictionFunction') <- 'madeupPrediction' - attr(result, 'modelType') <- 'binary' + class(result) <- "plpModel" + attr(result, "predictionFunction") <- "madeupPrediction" + attr(result, "modelType") <- "binary" return(result) - } - ``` -You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function (for example a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by `attr(result, 'predictionFunction') <- 'madeupPrediction'` is `madeupPrediction()`) as specified below. +You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function (for example a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by `attr(result, "predictionFunction") <- "madeupPrediction"` is `madeupPrediction()`) as specified below. 
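Once the set and fit functions are sourced, the custom classifier can be passed to the standard pipeline like any built-in model. A minimal sketch, assuming a `plpData` object is already available in the session:

```{r tidy=FALSE,eval=FALSE}
# sketch: plug the custom model into runPlp() like any other modelSettings
modelSettings <- setMadeUp(a = c(1, 4, 10), b = 2, seed = 42)

plpResults <- runPlp(
  plpData = plpData,
  modelSettings = modelSettings,
  analysisId = "madeUpExample"
)
```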
## Predict @@ -218,17 +213,15 @@ The prediction function takes as input the plpModel returned by fit, new data an For example: ```{r tidy=FALSE,eval=FALSE} -madeupPrediction <- function(plpModel, data, cohort){ - +madeupPrediction <- function(plpModel, data, cohort) { # ************* code to do prediction for each rowId in cohort # predictionValues <- code to do prediction here returning the predicted risk - # (value) for each rowId in cohort + # (value) for each rowId in cohort #************** - - prediction <- merge(cohort, predictionValues, by='rowId') - attr(prediction, "metaData") <- list(modelType = attr(plpModel, 'modelType')) + + prediction <- merge(cohort, predictionValues, by = "rowId") + attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType")) return(prediction) - } ``` @@ -239,117 +232,115 @@ Below a fully functional algorithm example is given, however we highly recommend ## Set ```{r tidy=FALSE,eval=FALSE} -setMadeUp <- function(a=c(1,4,6), b=2, seed=NULL){ +setMadeUp <- function(a = c(1, 4, 6), b = 2, seed = NULL) { # add input checks here... - - if(is.null(seed)){ - seed <- sample(100000,1) + + if (is.null(seed)) { + seed <- sample(100000, 1) } - + param <- split( expand.grid( - a=a, - b=b + a = a, + b = b ), - 1:(length(a)*length(b)) - ) - - attr(param, 'settings') <- list( + 1:(length(a) * length(b)) + ) + + attr(param, "settings") <- list( modelName = "Made Up", requiresDenseMatrix = TRUE, seed = seed - ) - + ) + # now create list of all combinations: result <- list( - fitFunction = 'fitMadeUp', # this will be called to train the made up model + fitFunction = "fitMadeUp", # this will be called to train the made up model param = param ) - class(result) <- 'modelSettings' - + class(result) <- "modelSettings" + return(result) } - ``` ## Fit ```{r tidy=FALSE,eval=FALSE} -fitMadeUp <- function(trainData, modelSettings, search, analysisId){ - +fitMadeUp <- function(trainData, modelSettings, search, analysisId) { # set the seed for reproducibility param <- modelSettings$param - set.seed(attr(param, 'settings')$seed) - + set.seed(attr(param, "settings")$seed) + # add folds to labels: - trainData$labels <- merge(trainData$labels, trainData$folds, by= 'rowId') + trainData$labels <- merge(trainData$labels, trainData$folds, by = "rowId") # convert data into sparse R Matrix: - mappedData <- toSparseM(trainData,map=NULL) + mappedData <- toSparseM(trainData, map = NULL) matrixData <- mappedData$dataMatrix labels <- mappedData$labels covariateRef <- mappedData$covariateRef - #============= STEP 1 ====================================== + # ============= STEP 1 ====================================== # pick the best hyper-params and then do final training on all data... 
-  writeLines('Cross validation')
-  param.sel <- lapply(
-    param,
-    function(x){
+  writeLines("Cross validation")
+  paramSel <- lapply(
+    param,
+    function(x) {
       do.call(
-        made_up_model,
+        madeUpModel,
         list(
-          param = x,
-          final = F,
-          data = matrixData,
+          param = x,
+          final = FALSE,
+          data = matrixData,
           labels = labels
-          )
+        )
       )
-    }
-  )
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
+    }
+  )
+  hyperSummary <- do.call(rbind, lapply(paramSel, function(x) x$hyperSum))
   hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc))
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  bestInd <- which.max(param.sel)
-
-  #get cross val prediction for best hyper-parameters
-  prediction <- param.sel[[bestInd]]$prediction
-  prediction$evaluationType <- 'CV'
-
-  writeLines('final train')
+  hyperSummary$auc <- unlist(lapply(paramSel, function(x) x$auc))
+  paramSel <- unlist(lapply(paramSel, function(x) x$auc))
+  bestInd <- which.max(paramSel)
+
+  # get cross val prediction for best hyper-parameters
+  prediction <- paramSel[[bestInd]]$prediction
+  prediction$evaluationType <- "CV"
+
+  writeLines("final train")
   finalResult <- do.call(
-    made_up_model,
+    madeUpModel,
     list(
-      param = param[[bestInd]],
-      final = T,
-      data = matrixData,
+      param = param[[bestInd]],
+      final = TRUE,
+      data = matrixData,
       labels = labels
-      )
     )
-
+  )
+
   trainedModel <- finalResult$model
-
+
   # prediction risk on training data:
-  finalResult$prediction$evaluationType <- 'Train'
-
+  finalResult$prediction$evaluationType <- "Train"
+
   # get CV and train prediction
   prediction <- rbind(prediction, finalResult$prediction)
-
+
   varImp <- covariateRef %>% dplyr::collect() # no feature importance available
-  vqrImp$covariateValue <- 0
-
-  timeToTrain <- Sys.time() - start
+  varImp$covariateValue <- 0
+
+  timeToTrain <- Sys.time() - start
   # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 prediction = prediction,
+  result <- list(
+    model = trainedModel,
+    prediction = prediction,
     preprocessing = list(
-      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
-      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
-      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
-
-    ),
+      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
+      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
+      requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix
+    ),
     modelDesign = list(
       outcomeId = attr(trainData, "metaData")$outcomeId,
       targetId = attr(trainData, "metaData")$targetId,
       restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings,
       covariateSettings = attr(trainData, "metaData")$covariateSettings,
       populationSettings = attr(trainData, "metaData")$populationSettings,
       featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings,
       prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings,
       modelSettings = list(
         model = attr(param, "settings")$modelName, # the model name
         param = param,
         finalModelParameters = param[[bestInd]], # best hyper-parameters
         extraSettings = attr(param, "settings")
       ),
       splitSettings = attr(trainData, "metaData")$splitSettings,
       sampleSettings =
attr(trainData, "metaData")$sampleSettings ), - trainDetails = list( analysisId = analysisId, developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, - attrition = attr(trainData, "metaData")$attrition, + attrition = attr(trainData, "metaData")$attrition, trainingTime = timeToTrain, # how long it took to train the model trainingDate = Sys.Date(), hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame ), - covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible - ), - covariateImportance = varImp + covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by = "covariateId") # add variable importance to covariateRef if possible ) - class(result) <- 'plpModel' - attr(result, 'predictionFunction') <- 'madeupPrediction' - attr(result, 'modelType') <- 'binary' + class(result) <- "plpModel" + attr(result, "predictionFunction") <- "madeupPrediction" + attr(result, "modelType") <- "binary" return(result) - } - ``` ## Helpers -In the fit model a helper function `made_up_model` is called, this is the function that trains a model given the data, labels and hyper-parameters. +In the fit model a helper function `madeUpModel` is called, this is the function that trains a model given the data, labels and hyper-parameters. ```{r tidy=FALSE,eval=FALSE} -made_up_model <- function(param, data, final=F, labels){ - - if(final==F){ +madeUpModel <- function(param, data, final = FALSE, labels) { + if (final == FALSE) { # add value column to store all predictions labels$value <- rep(0, nrow(labels)) attr(labels, "metaData") <- list(modelType = "binary") - + foldPerm <- c() # this holds CV aucs - for(index in 1:max(labels$index)){ + for (index in 1:max(labels$index)) { model <- madeup::model( - x = data[labels$index!=index,], # remove left out fold - y = labels$outcomeCount[labels$index!=index], - a = param$a, + x = data[labels$index != index, ], # remove left out fold + y = labels$outcomeCount[labels$index != index], + a = param$a, b = param$b ) - + # predict on left out fold - pred <- stats::predict(model, data[labels$index==index,]) - labels$value[labels$index==index] <- pred - - # calculate auc on help out fold - aucVal <- computeAuc(labels[labels$index==index,]) - foldPerm<- c(foldPerm,aucVal) + pred <- stats::predict(model, data[labels$index == index, ]) + labels$value[labels$index == index] <- pred + + # calculate auc on help out fold + aucVal <- computeAuc(labels[labels$index == index, ]) + foldPerm <- c(foldPerm, aucVal) } auc <- computeAuc(labels) # overal AUC - } else { model <- madeup::model( - x = data, + x = data, y = labels$outcomeCount, a = param$a, b = param$b - ) - + ) + pred <- stats::predict(model, data) labels$value <- pred - attr(labels, "metaData") <- list(modelType = "binary") + attr(labels, "metaData") <- list(modelType = "binary") auc <- computeAuc(labels) foldPerm <- auc } - + result <- list( model = model, auc = auc, prediction = labels, hyperSum = c(a = a, b = b, fold_auc = foldPerm) ) - + return(result) } ``` @@ -451,41 +435,38 @@ made_up_model <- function(param, data, final=F, labels){ The final step is to create a predict function for the model. In the example above the predeiction function `attr(result, 'predictionFunction') <- 'madeupPrediction'` was madeupPrediction, so a `madeupPrediction` function is required when applying the model. 
The predict function needs to take as input the plpModel returned by the fit function, new data to apply the model on and the cohort specifying the patients of interest to make the prediction for. ```{r tidy=FALSE,eval=FALSE} -madeupPrediction <- function(plpModel, data , cohort){ - - if(class(data) == 'plpData'){ +madeupPrediction <- function(plpModel, data, cohort) { + if (class(data) == "plpData") { # convert matrixObjects <- toSparseM( - plpData = data, + plpData = data, cohort = cohort, - map = plpModel$covariateImportance %>% + map = plpModel$covariateImportance %>% dplyr::select("columnId", "covariateId") ) - + newData <- matrixObjects$dataMatrix cohort <- matrixObjects$labels - - }else{ + } else { newData <- data } - - if(class(plpModel) == 'plpModel'){ + + if (class(plpModel) == "plpModel") { model <- plpModel$model - } else{ + } else { model <- plpModel } - - cohort$value <- stats::predict(model, data) - + + cohort$value <- stats::predict(model, newData) + # fix the rowIds to be the old ones # now use the originalRowId and remove the matrix rowId - cohort <- cohort %>% + cohort <- cohort %>% dplyr::select(-"rowId") %>% dplyr::rename(rowId = "originalRowId") - - attr(cohort, "metaData") <- list(modelType = attr(plpModel, 'modelType')) + + attr(cohort, "metaData") <- list(modelType = attr(plpModel, "modelType")) return(cohort) - } ``` diff --git a/vignettes/AddingCustomSamples.Rmd b/vignettes/AddingCustomSamples.Rmd index eb2da04f1..93241c085 100644 --- a/vignettes/AddingCustomSamples.Rmd +++ b/vignettes/AddingCustomSamples.Rmd @@ -41,8 +41,7 @@ and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the -[`BuildingPredictiveModels` -vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). +`vignette('BuildingPredictiveModels')`. **We invite you to share your new sample functions with the OHDSI community through our [GitHub @@ -86,29 +85,25 @@ specifying the number of patients to sample \* `sampleSeed` an integer/double specifying the seed for reproducibility ```{r, echo = TRUE, eval=FALSE} -createRandomSampleSettings <- function( - n = 10000, - sampleSeed = sample(10000,1) - ){ - +createRandomSampleSettings <- function(n = 10000, + sampleSeed = sample(10000, 1)) { # add input checks - checkIsClass(n, c('numeric','integer')) - checkHigher(n,0) - checkIsClass(sampleSeed, c('numeric','integer')) - + checkIsClass(n, c("numeric", "integer")) + checkHigher(n, 0) + checkIsClass(sampleSeed, c("numeric", "integer")) + # create list of inputs to implement function sampleSettings <- list( n = n, - sampleSeed = sampleSeed - ) - + sampleSeed = sampleSeed + ) + # specify the function that will implement the sampling attr(sampleSettings, "fun") <- "implementRandomSampleSettings" # make sure the object returned is of class "sampleSettings" class(sampleSettings) <- "sampleSettings" return(sampleSettings) - } ``` @@ -126,48 +121,46 @@ In our example, the `createRandomSampleSettings()` will return a list with 'n' and 'sampleSeed'. The sampleSettings therefore contains these. 
```{r tidy=FALSE,eval=FALSE}
-implementRandomSampleSettings <- function(trainData, sampleSettings){
+implementRandomSampleSettings <- function(trainData, sampleSettings) {
+  n <- sampleSettings$n
+  sampleSeed <- sampleSettings$sampleSeed

-  n <- sampleSetting$n
-  sampleSeed <- sampleSetting$sampleSeed
-  
-  if(n > nrow(trainData$labels)){
-    stop('Sample n bigger than training population')
+  if (n > nrow(trainData$labels)) {
+    stop("Sample n bigger than training population")
   }
-  
+
   # set the seed for the randomization
   set.seed(sampleSeed)
-  
+
   # now implement the code to do your desired sampling
-  
+
   sampleRowIds <- sample(trainData$labels$rowId, n)
-  
+
   sampleTrainData <- list()
-  
-  sampleTrainData$labels <- trainData$labels %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
+
+  sampleTrainData$labels <- trainData$labels %>%
+    dplyr::filter(.data$rowId %in% sampleRowIds) %>%
    dplyr::collect()
-  
-  sampleTrainData$folds <- trainData$folds %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
+
+  sampleTrainData$folds <- trainData$folds %>%
+    dplyr::filter(.data$rowId %in% sampleRowIds) %>%
    dplyr::collect()
-  
+
   sampleTrainData$covariateData <- Andromeda::andromeda()
-  sampleTrainData$covariateData$covariateRef <-trainData$covariateData$covariateRef
+  sampleTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef
   sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
    dplyr::filter(.data$rowId %in% sampleRowIds)
-  
-  #update metaData$populationSize
-  metaData <- attr(trainData$covariateData, 'metaData')
-  metaData$populationSize = n
-  attr(sampleTrainData$covariateData, 'metaData') <- metaData
-  
+
+  # update metaData$populationSize
+  metaData <- attr(trainData$covariateData, "metaData")
+  metaData$populationSize <- n
+  attr(sampleTrainData$covariateData, "metaData") <- metaData
+
   # make the covariateData the correct class
-  class(sampleTrainData$covariateData) <- 'CovariateData'
-  
+  class(sampleTrainData$covariateData) <- "CovariateData"
+
   # return the updated trainData
   return(sampleTrainData)
}
-
```

# Acknowledgments

diff --git a/vignettes/AddingCustomSplitting.Rmd b/vignettes/AddingCustomSplitting.Rmd
index 96cef75d3..d0e318d1c 100644
--- a/vignettes/AddingCustomSplitting.Rmd
+++ b/vignettes/AddingCustomSplitting.Rmd
@@ -32,8 +32,7 @@ library(PatientLevelPrediction)

# Introduction

-This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf).
-
+This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the `vignette('BuildingPredictiveModels')`.
**We invite you to share your new data splitting functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** # Data Splitting Function Code Structure @@ -55,19 +54,16 @@ Let's consider the situation where we wish to create a split where females are u Our gender split function requires a single parameter, the number of folds used in cross validation. Therefore create a function with a single nfold input that returns a list of class 'splitSettings' with the 'fun' attribute specifying the 'implement' function we will use. ```{r, echo = TRUE, eval=FALSE} -createGenderSplit <- function(nfold) - { - +createGenderSplit <- function(nfold) { # create list of inputs to implement function splitSettings <- list(nfold = nfold) - + # specify the function that will implement the sampling attr(splitSettings, "fun") <- "implementGenderSplit" # make sure the object returned is of class "sampleSettings" class(splitSettings) <- "splitSettings" return(splitSettings) - } ``` @@ -80,24 +76,22 @@ All 'implement' functions for data splitting must take as input the population a The index is used to determine whether the patient (identifed by the rowId) is in the test set (index = -1) or train set (index \> 0). In in the train set, the value corresponds to the cross validation fold. For example, if rowId 2 is assigned index 5, then it means the patient with the rowId 2 is used to train the model and is in fold 5. ```{r tidy=FALSE,eval=FALSE} -implementGenderSplit <- function(population, splitSettings){ - +implementGenderSplit <- function(population, splitSettings) { # find the people who are male: males <- population$rowId[population$gender == 8507] females <- population$rowId[population$gender == 8532] - + splitIds <- data.frame( rowId = c(males, females), index = c( rep(-1, length(males)), - sample(1:splitSettings$nfold, length(females), replace = T) + sample(1:splitSettings$nfold, length(females), replace = TRUE) ) ) - + # return the updated trainData return(splitIds) } - ``` # Acknowledgments diff --git a/vignettes/BuildingMultiplePredictiveModels.Rmd b/vignettes/BuildingMultiplePredictiveModels.Rmd index d23c7326e..93206f3c8 100644 --- a/vignettes/BuildingMultiplePredictiveModels.Rmd +++ b/vignettes/BuildingMultiplePredictiveModels.Rmd @@ -34,7 +34,7 @@ output: In our [`paper`](https://academic.oup.com/jamia/article/25/8/969/4989437), we propose a standardised framework for patient-level prediction that utilizes the OMOP CDM and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to enforce existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators. -One our best practices is that we see the selection of models and all study setting as an emperical question, i.e. we should use a data-driven approach in which we try many settings. This vignette describes how you can use the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to automatically build multiple patient-level predictive models, e.g. different population settings, covariate settings, and modelsetting. 
This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf).
+One of our best practices is that we see the selection of models and all study settings as an empirical question, i.e. we should use a data-driven approach in which we try many settings. This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to automatically build multiple patient-level predictive models, e.g. different population settings, covariate settings, and model settings. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the `vignette('BuildingPredictiveModels')`.

Note that it is also possible to generate a Study Package directly in Atlas that allows for multiple patient-level prediction analyses; this is out-of-scope for this vignette.

@@ -47,15 +47,15 @@ library(knitr)
kable(
  data.frame(
    input = c(
-      'targetId',
-      'outcomeId',
-      'restrictPlpDataSettings',
-      'populationSettings',
-      'covariateSettings',
-      'sampleSettings',
-      'featureEngineeringSettings',
-      'preprocessSettings',
-      'modelSettings'
+      "targetId",
+      "outcomeId",
+      "restrictPlpDataSettings",
+      "populationSettings",
+      "covariateSettings",
+      "sampleSettings",
+      "featureEngineeringSettings",
+      "preprocessSettings",
+      "modelSettings"
    ),

    Description = c(

    ),

-    caption = 'The inputs for the model design'
+  caption = "The inputs for the model design"
)
```

For example, if we wanted to predict the outcome (id 2) occuring for the first t

```{r tidy=FALSE,eval=FALSE}

# Model 1 is only using data between 2018-2020:
restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  studyStartDate = '20180101',
-  studyEndDate = '20191231'
+  studyStartDate = "20180101",
+  studyEndDate = "20191231"
)

# predict outcome within 1 to 180 days after index
# remove people with outcome prior and with < 365 days observation
populationSettings <- createStudyPopulationSettings(
-  binary = T,
-  firstExposureOnly = T,
+  binary = TRUE,
+  firstExposureOnly = TRUE,
  washoutPeriod = 365,
-  removeSubjectsWithPriorOutcome = T,
+  removeSubjectsWithPriorOutcome = TRUE,
  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F,
+  requireTimeAtRisk = FALSE,
  riskWindowStart = 1,
  riskWindowEnd = 180
)

# use age/gender in groups and condition groups as features
covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T,
-  useDemographicsAgeGroup = T,
-  useConditionGroupEraAnyTimePrior = T
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionGroupEraAnyTimePrior = TRUE
)

modelDesign1 <- createModelDesign(
  targetId = 1,
  outcomeId = 2,
  restrictPlpDataSettings = restrictPlpDataSettings,
  populationSettings = populationSettings,
  covariateSettings = covariateSettings,
  featureEngineeringSettings = createFeatureEngineeringSettings(),
  sampleSettings = createSampleSettings(),
  splitSettings = createDefaultSplitSetting(),
  preprocessSettings = createPreprocessSettings(),
  modelSettings = setLassoLogisticRegression()
)
```

@@ -135,22 +135,22 @@ restrictPlpDataSettings <- createRestrictPlpDataSettings(
# predict outcome within 1 to 730 days after index
# remove people with outcome prior and with < 365 days observation
populationSettings <- createStudyPopulationSettings(
-  binary = T,
-  firstExposureOnly = T,
+  binary = TRUE,
+  firstExposureOnly = TRUE,
  washoutPeriod = 365,
-  removeSubjectsWithPriorOutcome = T,
+  removeSubjectsWithPriorOutcome = TRUE,
  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F,
+  requireTimeAtRisk = FALSE,
  riskWindowStart = 1,
  riskWindowEnd = 730
)

# use age/gender in groups and condition/drug groups as features
covariateSettings <-
FeatureExtraction::createCovariateSettings( - useDemographicsGender = T, - useDemographicsAgeGroup = T, - useConditionGroupEraAnyTimePrior = T, - useDrugGroupEraAnyTimePrior = T + useDemographicsGender = TRUE, + useDemographicsAgeGroup = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraAnyTimePrior = TRUE ) modelDesign2 <- createModelDesign( @@ -181,22 +181,22 @@ restrictPlpDataSettings <- createRestrictPlpDataSettings( # predict outcome during target cohort start/end # remove people with < 365 days observation populationSettings <- createStudyPopulationSettings( - binary = T, - firstExposureOnly = T, + binary = TRUE, + firstExposureOnly = TRUE, washoutPeriod = 365, - removeSubjectsWithPriorOutcome = F, - requireTimeAtRisk = F, + removeSubjectsWithPriorOutcome = FALSE, + requireTimeAtRisk = FALSE, riskWindowStart = 0, - startAnchor = 'cohort start', + startAnchor = "cohort start", riskWindowEnd = 0, - endAnchor = 'cohort end' + endAnchor = "cohort end" ) # use age/gender in groups and measurement indicators as features covariateSettings <- FeatureExtraction::createCovariateSettings( - useDemographicsGender = T, - useDemographicsAgeGroup = T, - useMeasurementAnyTimePrior = T, + useDemographicsGender = TRUE, + useDemographicsAgeGroup = TRUE, + useMeasurementAnyTimePrior = TRUE, endDays = -1 ) @@ -243,16 +243,16 @@ Next you need to specify the cdmDatabaseSchema where your cdm database is found cdmDatabaseSchema <- "your cdmDatabaseSchema" workDatabaseSchema <- "your workDatabaseSchema" cdmDatabaseName <- "your cdmDatabaseName" -cohortTable <- "your cohort table", +cohortTable <- "your cohort table" databaseDetails <- createDatabaseDetails( connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, - cdmDatabaseName = cdmDatabaseName , + cdmDatabaseName = cdmDatabaseName, cohortDatabaseSchema = workDatabaseSchema, cohortTable = cohortTable, outcomeDatabaseSchema = workDatabaseSchema, - outcomeTable = cohortTable + outcomeTable = cohortTable, cdmVersion = 5 ) @@ -262,17 +262,16 @@ Now you can run the multiple patient-level prediction analysis: ```{r tidy=FALSE,eval=FALSE} results <- runMultiplePlp( - databaseDetails = databaseDetails, + databaseDetails = databaseDetails, modelDesignList = list( - modelDesign1, - modelDesign2, + modelDesign1, + modelDesign2, modelDesign3 - ), - onlyFetchData = F, - logSettings = createLogSettings(), - saveDirectory = "./PlpMultiOutput" - ) - + ), + onlyFetchData = FALSE, + logSettings = createLogSettings(), + saveDirectory = "./PlpMultiOutput" +) ``` This will then save all the plpData objects from the study into "./PlpMultiOutput/plpData_T1_L" and the results into "./PlpMultiOutput/Analysis\_". The csv file named settings.csv found in "./PlpMultiOutput" has a row for each prediction model developed and points to the plpData and settings used for the model development, it also has descriptions of the cohorts if these are input by the user. @@ -284,17 +283,16 @@ Note that if for some reason the run is interrupted, e.g. 
because of an error, a

If you have access to multiple databases on the same server in different schemas you could evaluate across these using this call:

```{r tidy=FALSE,eval=FALSE}
-
validationDatabaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails,
-  cdmDatabaseSchema = 'new cdm schema',
-  cdmDatabaseName = 'validation database',
-  cohortDatabaseSchema = workDatabaseSchema,
-  cohortTable = cohortTable,
-  outcomeDatabaseSchema = workDatabaseSchema,
-  outcomeTable = cohortTable,
+  connectionDetails = connectionDetails,
+  cdmDatabaseSchema = "new cdm schema",
+  cdmDatabaseName = "validation database",
+  cohortDatabaseSchema = workDatabaseSchema,
+  cohortTable = cohortTable,
+  outcomeDatabaseSchema = workDatabaseSchema,
+  outcomeTable = cohortTable,
  cdmVersion = 5
-  )
+)

val <- validateMultiplePlp(
  analysesLocation = "./PlpMultiOutput",
  validationDatabaseDetails = validationDatabaseDetails,
  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
  recalibrate = NULL,
  saveDirectory = "./PlpMultiOutput/Validation"
-  )
-
+)
```

This then saves the external validation results in the `Validation` folder of the main study (the `saveDirectory` you used in `runMultiplePlp()`).

## Viewing the results

To view the results for the multiple prediction analysis:

```{r tidy=FALSE,eval=FALSE}
-viewMultiplePlp(analysesLocation="./PlpMultiOutput")
+viewMultiplePlp(analysesLocation = "./PlpMultiOutput")
```

If the validation directory in "./PlpMultiOutput" has a sqlite results database, the external validation will also be displayed.

diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd
index b82e1e45c..9e5099cc2 100644
--- a/vignettes/BuildingPredictiveModels.Rmd
+++ b/vignettes/BuildingPredictiveModels.Rmd
@@ -33,10 +33,10 @@ output:
library(PatientLevelPrediction)
vignetteDataFolder <- "s:/temp/plpVignette"
# Load all needed data if it exists on this computer:
-if (file.exists(vignetteDataFolder)){
-  plpModel <- loadPlpModel(file.path(vignetteDataFolder,'model'))
-  lrResults <- loadPlpModel(file.path(vignetteDataFolder,'results'))
-}
+if (file.exists(vignetteDataFolder)) {
+  plpModel <- loadPlpModel(file.path(vignetteDataFolder, "model"))
+  lrResults <- loadPlpModel(file.path(vignetteDataFolder, "results"))
+}
```

```{r, echo = FALSE, message = FALSE, warning = FALSE}
library(PatientLevelPrediction)
```

@@ -61,7 +61,7 @@ As shown in Figure 2, to define a prediction problem we have to define t=0 by a

![Examples of prediction problems](problems.webp)

-This vignette describes how you can use the `PatientLevelPrediction` package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the [`InstallationGuide`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf).
+This vignette describes how you can use the `PatientLevelPrediction` package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the `vignette('InstallationGuide')`.
# Study specification @@ -103,12 +103,12 @@ The final study population in which we will develop our model is often a subset ## Model development settings -To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the [`AddingCustomModels`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) vignette. +To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in `vignette('AddingCustomModels')`. Our package currently contains the following algorithms to choose from: ```{r table2, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'} -tabl <- " +tabl <- " | Algorihm | Description | Hyper-parameters | | ----------| ---------------------------------------------------| ----------------------- | | Regularized Logistic Regression | Lasso logistic regression belongs to the family of generalized linear models, where a linear combination of the variables is learned and finally a logistic function maps the linear combination to a value between 0 and 1. The lasso regularization adds a cost based on model complexity to the objective function when training the model. This cost is the sum of the absolute values of the linear combination of the coefficients. The model automatically performs feature selection by minimizing this cost. We use the Cyclic coordinate descent for logistic, Poisson and survival analysis (Cyclops) package to perform large-scale regularized logistic regression: https://github.com/OHDSI/Cyclops | var (starting variance), seed | @@ -314,29 +314,32 @@ This is parameterized SQL which can be used by the [`SqlRender`](http://github.c To execute this `sql` against our CDM we first need to tell R how to connect to the server. `PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails()`. Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). 
For example, one might connect to a PostgreSQL database using this code: ```{r tidy=FALSE,eval=FALSE} - library(DatabaseConnector) - connectionDetails <- createConnectionDetails(dbms = "postgresql", - server = "localhost/ohdsi", - user = "joe", - password = "supersecret") - - cdmDatabaseSchema <- "cdm" - cohortsDatabaseSchema <- "cohorts" - cdmVersion <- "5" +library(DatabaseConnector) +connectionDetails <- createConnectionDetails( + dbms = "postgresql", + server = "localhost/ohdsi", + user = "joe", + password = "supersecret" +) + +cdmDatabaseSchema <- "cdm" +cohortsDatabaseSchema <- "cohorts" +cdmVersion <- "5" ``` The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, you need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. ```{r tidy=FALSE,eval=FALSE} - library(SqlRender) - sql <- readSql("AfStrokeCohorts.sql") - sql <- render(sql, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortsDatabaseSchema = cohortsDatabaseSchema) - sql <- translate(sql, targetDialect = connectionDetails$dbms) - - connection <- connect(connectionDetails) - executeSql(connection, sql) +library(SqlRender) +sql <- readSql("AfStrokeCohorts.sql") +sql <- render(sql, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortsDatabaseSchema = cohortsDatabaseSchema +) +sql <- translate(sql, targetDialect = connectionDetails$dbms) + +connection <- connect(connectionDetails) +executeSql(connection, sql) ``` In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the `connectionDetails`. Next, we connect to the server, and submit the rendered and translated SQL. @@ -344,17 +347,19 @@ In this code, we first read the SQL from the file into memory. In the next line, If all went well, we now have a table with the events of interest. We can see how many events per type: ```{r tidy=FALSE,eval=FALSE} - sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", +sql <- paste( + "SELECT cohort_definition_id, COUNT(*) AS count", "FROM @cohortsDatabaseSchema.AFibStrokeCohort", - "GROUP BY cohort_definition_id") - sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) - sql <- translate(sql, targetDialect = connectionDetails$dbms) - - querySql(connection, sql) + "GROUP BY cohort_definition_id" +) +sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) +sql <- translate(sql, targetDialect = connectionDetails$dbms) + +querySql(connection, sql) ``` ```{r echo=FALSE,message=FALSE} -data.frame(cohort_definition_id = c(1, 2),count = c(527616, 221555)) +data.frame(cohort_definition_id = c(1, 2), count = c(527616, 221555)) ``` ### Study script creation @@ -366,16 +371,18 @@ In this section we assume that our cohorts have been created either by using ATL Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtraction`](https://github.com/OHDSI/FeatureExtraction) package. In short the `FeatureExtraction` package allows you to specify which features (`covariates`) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom `covariates`. 
For more detailed information on the `FeatureExtraction` package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: ```{r tidy=FALSE,eval=FALSE} - library(FeatureExtraction) - covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, - useDemographicsAge = TRUE, - useConditionGroupEraLongTerm = TRUE, - useConditionGroupEraAnyTimePrior = TRUE, - useDrugGroupEraLongTerm = TRUE, - useDrugGroupEraAnyTimePrior = TRUE, - useVisitConceptCountLongTerm = TRUE, - longTermStartDays = -365, - endDays = -1) +library(FeatureExtraction) +covariateSettings <- createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useConditionGroupEraLongTerm = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraLongTerm = TRUE, + useDrugGroupEraAnyTimePrior = TRUE, + useVisitConceptCountLongTerm = TRUE, + longTermStartDays = -365, + endDays = -1 +) ``` The final step for extracting the data is to run the `getPlpData()` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the `washoutPeriod` which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed `covariateSettings`. @@ -385,25 +392,26 @@ library(PatientLevelPrediction) databaseDetails <- createDatabaseDetails( connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, - cdmDatabaseName = '', + cdmDatabaseName = "", cohortDatabaseSchema = cohortsDatabaseSchema, - cohortTable = 'AFibStrokeCohort', + cohortTable = "AFibStrokeCohort", targetId = 1, outcomeDatabaseSchema = cohortsDatabaseSchema, - outcomeTable = 'AFibStrokeCohort', + outcomeTable = "AFibStrokeCohort", outcomeIds = 2, cdmVersion = 5 - ) +) # here you can define whether you want to sample the target cohort and add any # restrictions based on minimum prior observation, index date restrictions # or restricting to first index date (if people can be in target cohort multiple times) restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000) -plpData <- getPlpData(databaseDetails = databaseDetails, - covariateSettings = covariateSettings, - restrictPlpDataSettings = restrictPlpDataSettings - ) +plpData <- getPlpData( + databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = restrictPlpDataSettings +) ``` Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `getPlpData()` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `Andromeda` (which uses [SQLite](https://www.sqlite.org/index.html)) to store information in a way that ensures R does not run out of memory, even when the data are large. @@ -411,7 +419,7 @@ Note that if the cohorts are created in ATLAS its corresponding cohort database Creating the `plpData` object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because `plpData` uses `Andromeda`, we cannot use R's regular save function. 
Instead, we'll have to use the `savePlpData()` function:

```{r tidy=TRUE,eval=FALSE}
savePlpData(plpData, "stroke_in_af_data")
```

We can use the `loadPlpData()` function to load the data in a future session.

### Additional inclusion criteria

To completely define the prediction problem the final study population is obtain

In the example below all the settings we defined for our study are imposed:

```{r tidy=FALSE,eval=FALSE}
populationSettings <- createStudyPopulationSettings(
  washoutPeriod = 1095,
  firstExposureOnly = FALSE,
  removeSubjectsWithPriorOutcome = FALSE,
  priorOutcomeLookback = 1,
  riskWindowStart = 1,
  riskWindowEnd = 365,
  startAnchor = "cohort start",
  endAnchor = "cohort start",
  minTimeAtRisk = 364,
  requireTimeAtRisk = TRUE,
  includeAllOutcomes = TRUE
)
```

### Splitting the data into training/validation/testing datasets

It is recommended to split data into three sets: a training set, a validation set and a testing set. In small data the best approach for internal validation has been shown to be boo

In the `PatientLevelPrediction` package, the `splitSettings` define how the `plpData` are partitioned into training/validation/testing data. They are created with `createDefaultSplitSetting()`. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the `splitSettings` we can use the type (`stratified`/`time`/`subject`) and `testFraction` parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

```{r tidy=FALSE,eval=FALSE}
splitSettings <- createDefaultSplitSetting(
  trainFraction = 0.75,
  testFraction = 0.25,
  type = "stratified",
  nfold = 2,
  splitSeed = 1234
)
```

Note: it is possible to add a custom method to specify how the `plpData` are partitioned into training/validation/testing data, see `vignette('AddingCustomSplitting')`.

### Preprocessing the training data

There are numerous data processing settings that a user must specify when developing a prediction model.

The default sample setting does nothing; it simply returns the trainData as input, see below:

```{r tidy=FALSE,eval=FALSE}
sampleSettings <- createSampleSettings()
```

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see `vignette('AddingCustomSamples')`.

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features.
The default feature engineering setting does nothing:

```{r tidy=FALSE,eval=FALSE}
featureEngineeringSettings <- createFeatureEngineeringSettings()
```

However, it is possible to add custom feature engineering functions into the pipeline, see `vignette('AddingCustomFeatureEngineering')`.

Finally, the preprocessing setting is required. For this setting the user can define `minFraction`; this removes any feature that is observed in the training data for less than the specified fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1; this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed.

```{r tidy=FALSE,eval=FALSE}
preprocessSettings <- createPreprocessSettings(
  minFraction = 0.01,
  normalize = TRUE,
  removeRedundancy = TRUE
)
```

### Model Development

In the set function of an algorithm the user can specify a list of eligible values

```{r tidy=TRUE,eval=FALSE}
lrModel <- setLassoLogisticRegression()
```

The `runPlp` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model.

```{r tidy=FALSE,eval=FALSE}
lrResults <- runPlp(
  plpData = plpData,
  outcomeId = 2,
  analysisId = "singleDemo",
  analysisName = "Demonstration of runPlp for training single PLP models",
  populationSettings = populationSettings,
  splitSettings = splitSettings,
  sampleSettings = sampleSettings,
  featureEngineeringSettings = featureEngineeringSettings,
  preprocessSettings = preprocessSettings,
  modelSettings = lrModel,
  logSettings = createLogSettings(),
  executeSettings = createExecuteSettings(
    runSplitData = TRUE,
    runSampleData = TRUE,
    runfeatureEngineering = TRUE,
    runPreprocessData = TRUE,
    runModelDevelopment = TRUE,
    runCovariateSummary = TRUE
  ),
  saveDirectory = file.path(getwd(), "singlePlp")
)
```

Under the hood the package will now use the [`Cyclops`](www.github.com/OHDSI/Cyclops) package to fit a large-scale regularized regression using 75% of the data and will evaluate the model on the remaining 25%.
A results data structure is returned containing information about the model, its performance etc. @@ -536,19 +544,19 @@ savePlpModel(lrResults$model, dirPath = file.path(getwd(), "model")) You can load the model using: ```{r tidy=TRUE,eval=FALSE} - plpModel <- loadPlpModel(file.path(getwd(),'model')) +plpModel <- loadPlpModel(file.path(getwd(), "model")) ``` You can also save the full results structure using: ```{r tidy=TRUE,eval=FALSE} - savePlpResult(lrResults, location = file.path(getwd(),'lr')) +savePlpResult(lrResults, location = file.path(getwd(), "lr")) ``` To load the full results structure use: ```{r tidy=TRUE,eval=FALSE} - lrResults <- loadPlpResult(file.path(getwd(),'lr')) +lrResults <- loadPlpResult(file.path(getwd(), "lr")) ``` \newpage @@ -705,28 +713,31 @@ This is parameterized SQL which can be used by the [`SqlRender`](http://github.c To execute this sql against our CDM we first need to tell R how to connect to the server. `PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails`. Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code: ```{r tidy=FALSE,eval=FALSE} - connectionDetails <- createConnectionDetails(dbms = "postgresql", - server = "localhost/ohdsi", - user = "joe", - password = "supersecret") - - cdmDatabaseSchema <- "my_cdm_data" - cohortsDatabaseSchema <- "my_results" - cdmVersion <- "5" +connectionDetails <- createConnectionDetails( + dbms = "postgresql", + server = "localhost/ohdsi", + user = "joe", + password = "supersecret" +) + +cdmDatabaseSchema <- "my_cdm_data" +cohortsDatabaseSchema <- "my_results" +cdmVersion <- "5" ``` The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. ```{r tidy=FALSE,eval=FALSE} - library(SqlRender) - sql <- readSql("AceAngioCohorts.sql") - sql <- render(sql, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortsDatabaseSchema = cohortsDatabaseSchema) - sql <- translate(sql, targetDialect = connectionDetails$dbms) - - connection <- connect(connectionDetails) - executeSql(connection, sql) +library(SqlRender) +sql <- readSql("AceAngioCohorts.sql") +sql <- render(sql, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortsDatabaseSchema = cohortsDatabaseSchema +) +sql <- translate(sql, targetDialect = connectionDetails$dbms) + +connection <- connect(connectionDetails) +executeSql(connection, sql) ``` In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the `connectionDetails`. Next, we connect to the server, and submit the rendered and translated SQL. @@ -734,17 +745,19 @@ In this code, we first read the SQL from the file into memory. In the next line, If all went well, we now have a table with the events of interest. 
We can see how many events per type: ```{r tidy=FALSE,eval=FALSE} - sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", - "FROM @cohortsDatabaseSchema.AceAngioCohort", - "GROUP BY cohort_definition_id") - sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) - sql <- translate(sql, targetDialect = connectionDetails$dbms) - - querySql(connection, sql) +sql <- paste( + "SELECT cohort_definition_id, COUNT(*) AS count", + "FROM @cohortsDatabaseSchema.AceAngioCohort", + "GROUP BY cohort_definition_id" +) +sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) +sql <- translate(sql, targetDialect = connectionDetails$dbms) + +querySql(connection, sql) ``` ```{r echo=FALSE,message=FALSE} - data.frame(cohort_definition_id = c(1, 2),count = c(0, 0)) +data.frame(cohort_definition_id = c(1, 2), count = c(0, 0)) ``` ### Study script creation @@ -756,42 +769,42 @@ In this section we assume that our cohorts have been created either by using ATL Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: ```{r tidy=FALSE,eval=FALSE} - covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, - useDemographicsAge = TRUE, - useConditionGroupEraLongTerm = TRUE, - useConditionGroupEraAnyTimePrior = TRUE, - useDrugGroupEraLongTerm = TRUE, - useDrugGroupEraAnyTimePrior = TRUE, - useVisitConceptCountLongTerm = TRUE, - longTermStartDays = -365, - endDays = -1) +covariateSettings <- createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useConditionGroupEraLongTerm = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraLongTerm = TRUE, + useDrugGroupEraAnyTimePrior = TRUE, + useVisitConceptCountLongTerm = TRUE, + longTermStartDays = -365, + endDays = -1 +) ``` The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. 
```{r tidy=FALSE,eval=FALSE}
-
databaseDetails <- createDatabaseDetails(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  cohortDatabaseSchema = resultsDatabaseSchema,
-  cohortTable = 'AceAngioCohort',
+  cohortTable = "AceAngioCohort",
  cohortId = 1,
  outcomeDatabaseSchema = resultsDatabaseSchema,
-  outcomeTable = 'AceAngioCohort',
+  outcomeTable = "AceAngioCohort",
  outcomeIds = 2
-  )
+)

restrictPlpDataSettings <- createRestrictPlpDataSettings(
  sampleSize = 10000
-  )
+)

plpData <- getPlpData(
-  databaseDetails = databaseDetails,
-  covariateSettings = covariateSettings,
+  databaseDetails = databaseDetails,
+  covariateSettings = covariateSettings,
  restrictPlpDataSettings = restrictPlpDataSettings
-  )
-
+)
```

Note that if the cohorts are created in ATLAS, the corresponding cohort database schema needs to be selected. There are many additional parameters for the `getPlpData` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `Andromeda` to store information in a way that ensures R does not run out of memory, even when the data are large.

Creating the `plpData` object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because `plpData` uses `Andromeda`, we cannot use R's regular save function. Instead, we'll have to use the `savePlpData()` function:

```{r tidy=TRUE,eval=FALSE}
savePlpData(plpData, "angio_in_ace_data")
```

We can use the `loadPlpData()` function to load the data in a future session.

### Additional inclusion criteria

To completely define the prediction problem the final study population is obtain

In Appendix 1, we demonstrate the effect of these settings on the subset of the

In the example below all the settings we defined for our study are imposed:

```{r tidy=FALSE,eval=FALSE}
populationSettings <- createStudyPopulationSettings(
  washoutPeriod = 364,
  firstExposureOnly = FALSE,
  removeSubjectsWithPriorOutcome = TRUE,
  priorOutcomeLookback = 9999,
  riskWindowStart = 1,
  riskWindowEnd = 365,
  minTimeAtRisk = 364,
  startAnchor = "cohort start",
  endAnchor = "cohort start",
  requireTimeAtRisk = TRUE,
  includeAllOutcomes = TRUE
)
```

### Splitting the data into training/validation/testing datasets

It is recommended to split data into three sets: a training set, a validation set and a testing set. In small data the best approach for internal validation has been shown to be boo

In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set).
For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

```{r tidy=FALSE,eval=FALSE}
splitSettings <- createDefaultSplitSetting(
  trainFraction = 0.75,
  testFraction = 0.25,
  type = "stratified",
  nfold = 2,
  splitSeed = 1234
)
```

Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see `vignette('AddingCustomSplitting')`.

### Preprocessing the training data

There are numerous data processing settings that a user must specify when developing a prediction model.

The default sample setting does nothing; it simply returns the trainData as input, see below:

```{r tidy=FALSE,eval=FALSE}
sampleSettings <- createSampleSettings()
```

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see `vignette('AddingCustomSamples')`.

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing:

```{r tidy=FALSE,eval=FALSE}
featureEngineeringSettings <- createFeatureEngineeringSettings()
```

However, it is possible to add custom feature engineering functions into the pipeline, see `vignette('AddingCustomFeatureEngineering')`.

Finally, the preprocessing setting is required. For this setting the user can define `minFraction`; this removes any feature that is observed in the training data for less than the specified fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1; this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed.
```{r tidy=FALSE,eval=FALSE}
preprocessSettings <- createPreprocessSettings(
  minFraction = 0.01,
  normalize = TRUE,
  removeRedundancy = TRUE
)
```

### Model Development

In the set function of an algorithm the user can specify a list of eligible values

For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the best cross-validation performance will then be chosen for the final model. For our problem we choose to build a gradient boosting machine model with a grid of hyper-parameters:

```{r tidy=TRUE,eval=FALSE}
gbmModel <- setGradientBoostingMachine(
  ntrees = 5000,
  maxDepth = c(4, 7, 10),
  learnRate = c(0.001, 0.01, 0.1, 0.9)
)
```

The `runPlp` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model.

```{r tidy=FALSE,eval=FALSE}
gbmResults <- runPlp(
  plpData = plpData,
  outcomeId = 2,
  analysisId = "singleDemo2",
  analysisName = "Demonstration of runPlp for training single PLP models",
  populationSettings = populationSettings,
  splitSettings = splitSettings,
  sampleSettings = sampleSettings,
  featureEngineeringSettings = featureEngineeringSettings,
  preprocessSettings = preprocessSettings,
  modelSettings = gbmModel,
  logSettings = createLogSettings(),
  executeSettings = createExecuteSettings(
    runSplitData = TRUE,
    runSampleData = TRUE,
    runfeatureEngineering = TRUE,
    runPreprocessData = TRUE,
    runModelDevelopment = TRUE,
    runCovariateSummary = TRUE
  ),
  saveDirectory = file.path(getwd(), "singlePlpExample2")
)
```

Under the hood the package will now use the R xgboost package to fit a gradient boosting machine model using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc.
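Before saving anything, it can be useful to glance at what came back. A minimal sketch of inspecting the result; the element names below (e.g., `performanceEvaluation`, `covariateSummary`) are assumptions based on the result structure described elsewhere in this vignette:

```{r tidy=FALSE,eval=FALSE}
# a sketch: inspect the returned result object
names(gbmResults) # assumed to include model, prediction, performanceEvaluation, covariateSummary
head(gbmResults$covariateSummary)

# the interactive shiny app can also be used to explore the result
viewPlp(runPlp = gbmResults)
```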
@@ -924,25 +937,25 @@ Under the hood the package will now use the R xgboost package to fit a a gradien You can save the model using: ```{r tidy=TRUE,eval=FALSE} - savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model")) +savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model")) ``` You can load the model using: ```{r tidy=TRUE,eval=FALSE} - plpModel <- loadPlpModel(file.path(getwd(),'model')) +plpModel <- loadPlpModel(file.path(getwd(), "model")) ``` You can also save the full results structure using: ```{r tidy=TRUE,eval=FALSE} - savePlpResult(gbmResults, location = file.path(getwd(),'gbm')) +savePlpResult(gbmResults, location = file.path(getwd(), "gbm")) ``` To load the full results structure use: ```{r tidy=TRUE,eval=FALSE} - gbmResults <- loadPlpResult(file.path(getwd(),'gbm')) +gbmResults <- loadPlpResult(file.path(getwd(), "gbm")) ``` \newpage @@ -1040,7 +1053,7 @@ Furthermore, many interactive plots are available in the Shiny App, for example To generate and save all the evaluation plots to a folder run the following code: ```{r tidy=TRUE,eval=FALSE} -plotPlp(lrResults, dirPath=getwd()) +plotPlp(lrResults, dirPath = getwd()) ``` The plots are described in more detail in the next sections. @@ -1149,22 +1162,22 @@ The plot shows that the mean of most of the covariates is higher for subjects wi \## Precision recall -Precision (P) is defined as the number of true positives (Tp) over the number of true positives plus the number of false positives (Fp). +Precision is defined as the number of true positives (Tp) over the number of true positives plus the number of false positives (Fp). ```{r tidy=TRUE,eval=FALSE} -P <- Tp/(Tp+Fp) +precision <- Tp / (Tp + Fp) ``` -Recall (R) is defined as the number of true positives (Tp) over the number of true positives plus the number of false negatives (Fn). +Recall is defined as the number of true positives (Tp) over the number of true positives plus the number of false negatives (Fn). ```{r tidy=TRUE,eval=FALSE} -R <- Tp/(Tp + Fn) +recall <- Tp / (Tp + Fn) ``` These quantities are also related to the (F1) score, which is defined as the harmonic mean of precision and recall. ```{r tidy=TRUE,eval=FALSE} -F1 <- 2*P*R/(P+R) +f1Score <- 2 * P * R / (P + R) ``` Note that the precision can either decrease or increase if the threshold is lowered. Lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision. @@ -1199,7 +1212,7 @@ We recommend to always perform external validation, i.e. apply the final model o ```{r tidy=FALSE,eval=FALSE} # load the trained model -plpModel <- loadPlpModel(getwd(),'model') +plpModel <- loadPlpModel(getwd(), "model") # add details of new database validationDatabaseDetails <- createDatabaseDetails() @@ -1210,11 +1223,10 @@ externalValidateDbPlp( validationDatabaseDetails = validationDatabaseDetails, validationRestrictPlpDataSettings = plpModel$settings$plpDataSettings, settings = createValidationSettings( - recalibrate = 'weakRecalibration' - ), + recalibrate = "weakRecalibration" + ), outputFolder = getwd() ) - ``` This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. 
Finally, it will evaluate the performance and return the standard output as `validation$performanceEvaluation` and it will also return the prediction on the population as `validation$prediction`. They can be inserted into the shiny app for viewing the model and validation by running: `viewPlp(runPlp = plpResult, validatePlp = validation)`.

# Other functionality

The package has much more functionality than described in this vignette and contributions have been made by many people in the OHDSI community. The table below provides an overview:

| Functionality | Description | Vignette |
|------------------------|------------------------|------------------------|
-| Builing Multiple Models | This vignette describes how you can run multiple models automatically | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) |
-| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) |
-| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) |
-| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) |
-| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) |
-| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) |
+| Building Multiple Models | This vignette describes how you can run multiple models automatically | `vignette('BuildingMultiplePredictiveModels')` |
+| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | `vignette('AddingCustomModels')` |
+| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | `vignette('AddingCustomSplitting')` |
+| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | `vignette('AddingCustomSamples')` |
+| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | `vignette('AddingCustomFeatureEngineering')` |
+| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.
| `vignette('CreatingLearningCurves')` | # Demos We have added several demos in the package that run on simulated data: ```{r eval=FALSE} -# Show all demos in our package: +# Show all demos in our package: demo(package = "PatientLevelPrediction") # For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call diff --git a/vignettes/ConstrainedPredictors.Rmd b/vignettes/ConstrainedPredictors.Rmd index 226bbca06..420f218ac 100644 --- a/vignettes/ConstrainedPredictors.Rmd +++ b/vignettes/ConstrainedPredictors.Rmd @@ -39,7 +39,7 @@ Here we provide a set of phenotypes that can be used as predictors in prediction These phenotypes can be extracted from the PhenotypeLibrary R package. To install the R package run: ```{r, echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval=FALSE} -remotes::install_github('ohdsi/PhenotypeLibrary') +remotes::install_github("ohdsi/PhenotypeLibrary") ``` To extract the cohort definition for Alcoholism with an id of 1165, just run: diff --git a/vignettes/CreatingLearningCurves.Rmd b/vignettes/CreatingLearningCurves.Rmd index 0db8d4a79..6430f7c5b 100644 --- a/vignettes/CreatingLearningCurves.Rmd +++ b/vignettes/CreatingLearningCurves.Rmd @@ -40,15 +40,15 @@ knitr::opts_chunk$set(echo = TRUE) library(PatientLevelPrediction) vignetteDataFolder <- "s:/temp/plpVignette" # Load all needed data if it exists on this computer: -if (file.exists(vignetteDataFolder)){ - plpModel <- loadPlpModel(vignetteDataFolder,'model') - lrResults <- loadPlpModel(file.path(vignetteDataFolder,'results')) +if (file.exists(vignetteDataFolder)) { + plpModel <- loadPlpModel(vignetteDataFolder, "model") + lrResults <- loadPlpModel(file.path(vignetteDataFolder, "results")) } ``` # Introduction -This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the `vignette('BuildingPredictiveModels')`. Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, best-practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and asses its ability to generalize to unseen data by measuring its performance on the testing set. @@ -78,7 +78,6 @@ plpData <- simulatePlpData( plpDataSimulationProfile, n = sampleSize ) - ``` Specify the population settings (this does additional exclusions such as requiring minimum prior observation or no prior outcome as well as specifying the time-at-risk period to enable labels to be created): @@ -103,18 +102,16 @@ Specify the prediction algorithm to be used. 
 ```{r eval=FALSE}
 # Use LASSO logistic regression
 modelSettings <- setLassoLogisticRegression()
-
 ```

 Specify the split settings and a sequence of training set fractions (these override the split setting's `trainFraction`). Alternatively, you can provide a sequence of training event counts (`trainEvents`) instead of training set fractions. This is recommended, because our research has shown that the number of events is the main determinant of model performance; see the sketch at the end of this section. Make sure that your training set contains the number of events specified.

 ```{r eval = FALSE}
-
-splitSettings = createDefaultSplitSetting(
-  testFraction = 0.2,
-  type = 'stratified',
+splitSettings <- createDefaultSplitSetting(
+  testFraction = 0.2,
+  type = "stratified",
   splitSeed = 1000
-  )
+)

 trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
@@ -127,31 +124,29 @@ Create the learning curve object.
 ```{r eval=FALSE}
 learningCurve <- createLearningCurve(
   plpData = plpData,
-  outcomeId = 2,
-  parallel = T,
+  outcomeId = 2,
+  parallel = TRUE,
   cores = 4,
   modelSettings = modelSettings,
   saveDirectory = getwd(),
-  analysisId = 'learningCurve',
+  analysisId = "learningCurve",
   populationSettings = populationSettings,
   splitSettings = splitSettings,
   trainFractions = trainFractions,
   trainEvents = NULL,
   preprocessSettings = createPreprocessSettings(
     minFraction = 0.001,
-    normalize = T
+    normalize = TRUE
   ),
   executeSettings = createExecuteSettings(
-    runSplitData = T,
-    runSampleData = F,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
-  )
+    runSplitData = TRUE,
+    runSampleData = FALSE,
+    runfeatureEngineering = FALSE,
+    runPreprocessData = TRUE,
+    runModelDevelopment = TRUE,
+    runCovariateSummary = FALSE
+  )
 )
-
-
 ```

 Plot the learning curve object (Figure 4). Specify one of the available metrics: `AUROC`, `AUPRC`, `sBrier`. Moreover, you can specify what to put on the abscissa: the number of `observations` or the number of `events`. We recommend the latter, because the number of events is the main determinant of model performance and it allows you to better compare learning curves across different prediction problems and databases.

@@ -159,10 +154,10 @@ Plot the learning curve object (Figure 4). Specify one of the available metrics:
 ```{r eval=FALSE}
 plotLearningCurve(
   learningCurve,
-  metric = 'AUROC',
-  abscissa = 'events',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
+  metric = "AUROC",
+  abscissa = "events",
+  plotTitle = "Learning Curve",
+  plotSubtitle = "AUROC performance"
 )
 ```

@@ -179,11 +174,11 @@ When running in parallel, R will find the number of available processing cores
 We have added a demo of the learning curve:

 ```{r eval=FALSE}
-# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
+# Show all demos in our package:
+demo(package = "PatientLevelPrediction")

 # Run the learning curve
- demo("LearningCurveDemo", package = "PatientLevelPrediction")
+demo("LearningCurveDemo", package = "PatientLevelPrediction")
 ```

 Do note that running this demo can take a considerable amount of time (15 min on a quad-core machine running in parallel)!
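
Since the vignette text above recommends `trainEvents` over `trainFractions`, here is a minimal sketch of that variant. It reuses the objects defined above and only the parameters shown in the vignette's own call; the event counts are illustrative assumptions, not values from the vignette, and must be supported by your data:

```{r eval=FALSE}
# Hedged sketch: drive the learning curve by training event counts
# instead of training set fractions (the counts below are made up).
learningCurveByEvents <- createLearningCurve(
  plpData = plpData,
  outcomeId = 2,
  modelSettings = modelSettings,
  saveDirectory = getwd(),
  analysisId = "learningCurveEvents",
  populationSettings = populationSettings,
  splitSettings = splitSettings,
  trainFractions = NULL,
  trainEvents = c(500, 1000, 2000, 4000)
)
```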
diff --git a/vignettes/CreatingNetworkStudies.Rmd b/vignettes/CreatingNetworkStudies.Rmd
index f35961d35..88f91a779 100644
--- a/vignettes/CreatingNetworkStudies.Rmd
+++ b/vignettes/CreatingNetworkStudies.Rmd
@@ -25,10 +25,10 @@ output:
 library(PatientLevelPrediction)
 vignetteDataFolder <- "s:/temp/plpVignette"
 # Load all needed data if it exists on this computer:
-if (file.exists(vignetteDataFolder)){
-  plpModel <- loadPlpModel(file.path(vignetteDataFolder,'model'))
-  lrResults <- loadPlpModel(file.path(vignetteDataFolder,'results'))
-}
+if (file.exists(vignetteDataFolder)) {
+  plpModel <- loadPlpModel(file.path(vignetteDataFolder, "model"))
+  lrResults <- loadPlpModel(file.path(vignetteDataFolder, "results"))
+}
 ```

 ```{r, echo = FALSE, message = FALSE, warning = FALSE}
diff --git a/vignettes/InstallationGuide.Rmd b/vignettes/InstallationGuide.Rmd
index 91788b711..7b3af179d 100644
--- a/vignettes/InstallationGuide.Rmd
+++ b/vignettes/InstallationGuide.Rmd
@@ -80,8 +80,7 @@ Many of the classifiers in the `PatientLevelPrediction` package use a Python backend. To
 ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
 library(PatientLevelPrediction)
 reticulate::install_miniconda()
-configurePython(envname='r-reticulate', envtype='conda')
-
+configurePython(envname = "r-reticulate", envtype = "conda")
 ```

 # Installation issues
@@ -107,13 +106,13 @@ usethis::edit_r_profile()
 and add

 ```{r eval=FALSE}
-Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
+Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep = ":"))
 ```

 to the file, then save. Here "your python bin location" is the location returned by

 ```{r eval=FALSE}
-reticulate::conda_list() 
+reticulate::conda_list()
 ```

 e.g., my PLP virtual environment location was /anaconda3/envs/PLP/bin/python, so I added:\