From 6c1092be067cdc7decade0e43e2033f32af350f8 Mon Sep 17 00:00:00 2001 From: Steve Bronder Date: Wed, 3 Apr 2024 10:31:48 -0400 Subject: [PATCH] update stevebronder from ctb to auth --- DESCRIPTION | 2 +- Meta/vignette.rds | Bin 0 -> 355 bytes doc/cmdstanr-internals.R | 214 ++++++++++ doc/cmdstanr-internals.Rmd | 487 +++++++++++++++++++++ doc/cmdstanr-internals.html | 783 ++++++++++++++++++++++++++++++++++ doc/cmdstanr.R | 230 ++++++++++ doc/cmdstanr.Rmd | 554 ++++++++++++++++++++++++ doc/cmdstanr.html | 814 ++++++++++++++++++++++++++++++++++++ doc/posterior.R | 77 ++++ doc/posterior.Rmd | 126 ++++++ doc/posterior.html | 442 ++++++++++++++++++++ doc/profiling.R | 106 +++++ doc/profiling.Rmd | 233 +++++++++++ doc/profiling.html | 552 ++++++++++++++++++++++++ doc/r-markdown.R | 28 ++ doc/r-markdown.Rmd | 157 +++++++ doc/r-markdown.html | 493 ++++++++++++++++++++++ 17 files changed, 5297 insertions(+), 1 deletion(-) create mode 100644 Meta/vignette.rds create mode 100644 doc/cmdstanr-internals.R create mode 100644 doc/cmdstanr-internals.Rmd create mode 100644 doc/cmdstanr-internals.html create mode 100644 doc/cmdstanr.R create mode 100644 doc/cmdstanr.Rmd create mode 100644 doc/cmdstanr.html create mode 100644 doc/posterior.R create mode 100644 doc/posterior.Rmd create mode 100644 doc/posterior.html create mode 100644 doc/profiling.R create mode 100644 doc/profiling.Rmd create mode 100644 doc/profiling.html create mode 100644 doc/r-markdown.R create mode 100644 doc/r-markdown.Rmd create mode 100644 doc/r-markdown.html diff --git a/DESCRIPTION b/DESCRIPTION index 7ce3e65e2..2c50a8da5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,7 @@ Authors@R: email = "will.landau@gmail.com", comment = c(ORCID = "0000-0003-1878-3253")), person(given = "Jacob", family = "Socolar", role = "ctb"), person(given = "Martin", family = "Modrák", role = "ctb"), - person(given = "Steve", family = "Bronder", role = "ctb")) + person(given = "Steve", family = "Bronder", role = "aut")) Description: A lightweight interface to 'Stan' . The 'CmdStanR' interface is an alternative to 'RStan' that calls the command line interface for compilation and running algorithms instead of interfacing diff --git a/Meta/vignette.rds b/Meta/vignette.rds new file mode 100644 index 0000000000000000000000000000000000000000..a73df1ca153a9bed68fdbc79dd8b22c732268799 GIT binary patch literal 355 zcmV-p0i6CHiwFP!000001C>+3P69CyEwG@Vpb19f!K4?D@CzP9G%+RwP4wE(0u9@i zv`seN{BIL*cI_-(!a)x+-M%;NdztBebO<4Jk~APgkVYR;3*sE2NfOeA^>U`T^Gw@; z)LvN4Qa79`9@tZ4BdohN4l1Rog4WDsD)k})dv>6hU2&7?0<9`vJ(-MhBOJX~{1rr| zG&6Q}7wDTu;XT|(LEn18X(s)$EX6zC;Y40kkP|fK0)0C{S2RCG!Q{d+!xyw@datS-G1*onx)L4BLq8~c{ZHG^U2@C<_F(3>Rnv}006d8 BrYHaa literal 0 HcmV?d00001 diff --git a/doc/cmdstanr-internals.R b/doc/cmdstanr-internals.R new file mode 100644 index 000000000..297b14e2c --- /dev/null +++ b/doc/cmdstanr-internals.R @@ -0,0 +1,214 @@ +params <- +list(EVAL = FALSE) + +## ----settings-knitr, include=FALSE-------------------------------------------- +stopifnot(require(knitr)) +opts_chunk$set( + # collapse = TRUE, + dev = "png", + dpi = 150, + fig.asp = 0.618, + fig.width = 5, + out.width = "60%", + fig.align = "center", + comment = NA, + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +## ----setup, message=FALSE----------------------------------------------------- +# library(cmdstanr) +# check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) + +## ----start-clean, include=FALSE----------------------------------------------- +# exe <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli") +# unlink(exe) + +## ----compile------------------------------------------------------------------ +# stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") +# mod <- cmdstan_model(stan_file) +# mod$print() +# mod$stan_file() +# mod$exe_file() + +## ----already-compiled--------------------------------------------------------- +# mod <- cmdstan_model(stan_file) + +## ----compile-options, eval=FALSE---------------------------------------------- +# mod <- cmdstan_model( +# stan_file, +# force_recompile = TRUE, +# include_paths = "paths/to/directories/with/included/files", +# cpp_options = list(stan_threads = TRUE, STANC2 = TRUE) +# ) + +## ----compile-method----------------------------------------------------------- +# unlink(mod$exe_file()) +# mod <- cmdstan_model(stan_file, compile = FALSE) +# mod$exe_file() # not yet created +# mod$compile() +# mod$exe_file() + +## ----stan_file_pedantic------------------------------------------------------- +# stan_file_pedantic <- write_stan_file(" +# data { +# int N; +# array[N] int y; +# } +# parameters { +# // should have but omitting to demonstrate pedantic mode +# real lambda; +# } +# model { +# y ~ poisson(lambda); +# } +# ") + +## ----pedantic-compile, collapse = TRUE---------------------------------------- +# mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE) + +## ----pedantic-check_syntax, collapse=TRUE------------------------------------- +# mod_pedantic$check_syntax(pedantic = TRUE) + +## ----pedantic-check_syntax-2, collapse=TRUE----------------------------------- +# file.remove(mod_pedantic$exe_file()) # delete compiled executable +# rm(mod_pedantic) +# +# mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE) +# mod_pedantic$check_syntax(pedantic = TRUE) + +## ----stan_file_variables------------------------------------------------------ +# stan_file_variables <- write_stan_file(" +# data { +# int J; +# vector[J] sigma; +# vector[J] y; +# } +# parameters { +# real mu; +# real tau; +# vector[J] theta_raw; +# } +# transformed parameters { +# vector[J] theta = mu + tau * theta_raw; +# } +# model { +# target += normal_lpdf(tau | 0, 10); +# target += normal_lpdf(mu | 0, 10); +# target += normal_lpdf(theta_raw | 0, 1); +# target += normal_lpdf(y | theta, sigma); +# } +# ") +# mod_v <- cmdstan_model(stan_file_variables) +# variables <- mod_v$variables() + +## ----variables-list-names----------------------------------------------------- +# names(variables) +# names(variables$data) +# names(variables$parameters) +# names(variables$transformed_parameters) +# names(variables$generated_quantities) + +## ----variable-type-dims------------------------------------------------------- +# variables$data$J +# variables$data$sigma +# variables$parameters$tau +# variables$transformed_parameters$theta + +## ----compile-with-dir, eval = FALSE------------------------------------------- +# mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable") + +## ----print-program-again------------------------------------------------------ +# mod$print() + +## ----data-list, eval=FALSE---------------------------------------------------- +# # data block has 'N' and 'y' +# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +# fit <- mod$sample(data = data_list) + +## ----write_stan_json---------------------------------------------------------- +# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +# json_file <- tempfile(fileext = ".json") +# write_stan_json(data_list, json_file) +# cat(readLines(json_file), sep = "\n") + +## ----data-json, eval=FALSE---------------------------------------------------- +# fit <- mod$sample(data = json_file) + +## ----data-rdump, eval=FALSE--------------------------------------------------- +# rdump_file <- tempfile(fileext = ".data.R") +# rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list)) +# cat(readLines(rdump_file), sep = "\n") +# fit <- mod$sample(data = rdump_file) + +## ----sample-tempdir, results = "hide"----------------------------------------- +# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +# fit <- mod$sample(data = data_list) + +## ----output_files------------------------------------------------------------- +# fit$output_files() + +## ----gc----------------------------------------------------------------------- +# files <- fit$output_files() +# file.exists(files) +# +# rm(fit) +# gc() +# +# file.exists(files) + +## ----save_output_files, eval=FALSE-------------------------------------------- +# # see ?save_output_files for info on optional arguments +# fit$save_output_files(dir = "path/to/directory") + +## ----output_dir, eval = FALSE------------------------------------------------- +# fit <- mod$sample( +# data = data_list, +# output_dir = "path/to/directory" +# ) + +## ----refit, include=FALSE----------------------------------------------------- +# fit <- mod$sample(data = data_list) + +## ----csv-not-read------------------------------------------------------------- +# str(fit) + +## ----for-csv-reading---------------------------------------------------------- +# draws <- fit$draws() # force CSVs to be read into R +# str(fit) + +## ----read_cmdstan_csv--------------------------------------------------------- +# # see ?read_cmdstan_csv for info on optional arguments controlling +# # what information is read in +# csv_contents <- read_cmdstan_csv(fit$output_files()) +# str(csv_contents) + +## ----as_cmdstan_fit----------------------------------------------------------- +# fit2 <- as_cmdstan_fit(fit$output_files()) + +## ----save_latent_dynamics, results = "hide"----------------------------------- +# fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE) + +## ----read-latent-dynamics----------------------------------------------------- +# fit$latent_dynamics_files() +# +# # read one of the files in +# x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#") +# head(x) + +## ----explore-latent-dynamics-------------------------------------------------- +# head(x[, c("theta", "p_theta", "g_theta")]) + +## ----verbose-mode------------------------------------------------------------- +# options("cmdstanr_verbose"=TRUE) +# +# mod <- cmdstan_model(stan_file, force_recompile = TRUE) +# fit <- mod$sample( +# data = data_list, +# chains = 1, +# iter_warmup = 100, +# iter_sampling = 100 +# ) + +## ----include=FALSE------------------------------------------------------------ +# options("cmdstanr_verbose" = FALSE) + diff --git a/doc/cmdstanr-internals.Rmd b/doc/cmdstanr-internals.Rmd new file mode 100644 index 000000000..96ef06ea0 --- /dev/null +++ b/doc/cmdstanr-internals.Rmd @@ -0,0 +1,487 @@ +--- +title: "How does CmdStanR work?" +author: "Jonah Gabry and Rok Češnovar" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 4 +params: + EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") +vignette: > + %\VignetteIndexEntry{How does CmdStanR work?} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r child="children/_settings-knitr.Rmd"} +``` + +## Introduction + +This vignette is intended to be read after the [_Getting started with CmdStanR_](http://mc-stan.org/cmdstanr/articles/cmdstanr.html) +vignette. Please read that first for important background. In this document we +provide additional details about compiling models, passing in data, and how +CmdStan output is saved and read back into R. + +We will only use the `$sample()` method in examples, but all model fitting +methods work in a similar way under the hood. + +```{r setup, message=FALSE} +library(cmdstanr) +check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) +``` + +## Compilation + +### Immediate compilation + +The `cmdstan_model()` function creates a new `CmdStanModel` object. The +`CmdStanModel` object stores the path to a Stan program as well as the +path to a compiled executable. + +```{r start-clean, include=FALSE} +exe <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli") +unlink(exe) +``` + +```{r compile} +stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") +mod <- cmdstan_model(stan_file) +mod$print() +mod$stan_file() +mod$exe_file() +``` + +Subsequently, if you create a `CmdStanModel` object from the same Stan file +then compilation will be skipped (assuming the file hasn't changed). + +```{r already-compiled} +mod <- cmdstan_model(stan_file) +``` + +Internally, `cmdstan_model()` first creates the `CmdStanModel` object from +just the Stan file and then calls its [`$compile()`](http://mc-stan.org/cmdstanr/reference/model-method-compile.html) +method. Optional arguments to the `$compile()` method can be passed via `...`. + +```{r compile-options, eval=FALSE} +mod <- cmdstan_model( + stan_file, + force_recompile = TRUE, + include_paths = "paths/to/directories/with/included/files", + cpp_options = list(stan_threads = TRUE, STANC2 = TRUE) +) +``` + + + +### Delayed compilation + +It is also possible to delay compilation when creating the `CmdStanModel` object +by specifying `compile=FALSE` and then later calling the `$compile()` method +directly. + +```{r compile-method} +unlink(mod$exe_file()) +mod <- cmdstan_model(stan_file, compile = FALSE) +mod$exe_file() # not yet created +mod$compile() +mod$exe_file() +``` + +### Pedantic check + +If you are using CmdStan version 2.24 or later and CmdStanR version 0.2.1 or +later, you can run a pedantic check for your model. CmdStanR will always check +that your Stan program does not contain any invalid syntax but with pedantic +mode enabled the check will also warn you about other potential issues in your +model, for example: + +- Distribution usages issues: distribution arguments do not match the +distribution specification, or some specific distribution is used in an +inadvisable way. +- Unused parameter: a parameter is defined but does not contribute to target. +- Large or small constant in a distribution: very large or very small constants +are used as distribution arguments. +- Control flow depends on a parameter: branching control flow (like if/else) +depends on a parameter value. +- Parameter has multiple twiddles: a parameter is on the left-hand side of +multiple twiddles (i.e., multiple `~` symbols). +- Parameter has zero or multiple priors: a parameter has zero or more than one +prior distribution. +- Variable is used before assignment: a variable is used before being assigned a +value. +- Strict or nonsensical parameter bounds: a parameter is given questionable +bounds. + +For the latest information on the checks performed in pedantic mode see the +[Pedantic mode chapter](https://mc-stan.org/docs/stan-users-guide/pedantic-mode.html) +in the Stan Reference Manual. + +Pedantic mode is available when compiling the model or when using the separate +`$check_syntax()` method of a `CmdStanModel` object. Internally this corresponds +to setting the `stanc` (Stan transpiler) option `warn-pedantic`. Here we +demonstrate pedantic mode with a Stan program that is syntactically correct but +is missing a lower bound and a prior for a parameter. + +```{r stan_file_pedantic} +stan_file_pedantic <- write_stan_file(" +data { + int N; + array[N] int y; +} +parameters { + // should have but omitting to demonstrate pedantic mode + real lambda; +} +model { + y ~ poisson(lambda); +} +") +``` + +To turn on pedantic mode at compile time you can set `pedantic=TRUE` in +the call to `cmdstan_model()` (or when calling the `$compile()` method directly +if using the delayed compilation approach described above). + +```{r pedantic-compile, collapse = TRUE} +mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE) +``` + +To turn on pedantic mode separately from compilation use the `pedantic` argument +to the `$check_syntax()` method. + +```{r pedantic-check_syntax, collapse=TRUE} +mod_pedantic$check_syntax(pedantic = TRUE) +``` + +Using `pedantic=TRUE` via the `$check_syntax()` method also has the advantage +that it can be used even if the model hasn't been compiled yet. This can be +helpful because the pedantic and syntax checks themselves are much faster than +compilation. + +```{r pedantic-check_syntax-2, collapse=TRUE} +file.remove(mod_pedantic$exe_file()) # delete compiled executable +rm(mod_pedantic) + +mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE) +mod_pedantic$check_syntax(pedantic = TRUE) +``` + +### Stan model variables + +If using CmdStan 2.27 or newer, you can obtain the names, types +and dimensions of the data, parameters, transformed parameters +and generated quantities variables of a Stan model using the +`$variables()` method of the `CmdStanModel` object. + +```{r stan_file_variables} +stan_file_variables <- write_stan_file(" +data { + int J; + vector[J] sigma; + vector[J] y; +} +parameters { + real mu; + real tau; + vector[J] theta_raw; +} +transformed parameters { + vector[J] theta = mu + tau * theta_raw; +} +model { + target += normal_lpdf(tau | 0, 10); + target += normal_lpdf(mu | 0, 10); + target += normal_lpdf(theta_raw | 0, 1); + target += normal_lpdf(y | theta, sigma); +} +") +mod_v <- cmdstan_model(stan_file_variables) +variables <- mod_v$variables() +``` + +The `$variables()` method returns a list with `data`, `parameters`, +`transformed_parameters` and `generated_quantities` elements, each +corresponding to variables in their respective block of the program. Transformed +data variables are not listed as they are not used in the model's input +or output. + +```{r variables-list-names} +names(variables) +names(variables$data) +names(variables$parameters) +names(variables$transformed_parameters) +names(variables$generated_quantities) +``` + +Each variable is represented as a list containing the type +information (currently limited to `real` or `int`) and the number of dimensions. + +```{r variable-type-dims} +variables$data$J +variables$data$sigma +variables$parameters$tau +variables$transformed_parameters$theta +``` + +### Executable location + +By default, the executable is created in the same directory as the file +containing the Stan program. You can also specify a different location with the +`dir` argument. + +```{r compile-with-dir, eval = FALSE} +mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable") +``` + +## Processing data + +There are three data formats that CmdStanR allows when fitting a model: + +* named list of R objects +* JSON file +* R dump file + +### Named list of R objects + +Like the RStan interface, CmdStanR accepts a named list of R objects where the +names correspond to variables declared in the data block of the Stan program. +In the Bernoulli model the data is `N`, the number of data points, and `y` +an integer array of observations. + +```{r print-program-again} +mod$print() +``` + +```{r data-list, eval=FALSE} +# data block has 'N' and 'y' +data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +fit <- mod$sample(data = data_list) +``` + +Because CmdStan doesn't accept lists of R objects, CmdStanR will first write the +data to a temporary JSON file using `write_stan_json()`. This happens +internally, but it is also possible to call `write_stan_json()` directly. + +```{r write_stan_json} +data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +json_file <- tempfile(fileext = ".json") +write_stan_json(data_list, json_file) +cat(readLines(json_file), sep = "\n") +``` + +### JSON file + +If you already have your data in a JSON file you can just pass that file +directly to CmdStanR instead of using a list of R objects. For example, we could +pass in the JSON file we created above using `write_stan_json()`: + +```{r data-json, eval=FALSE} +fit <- mod$sample(data = json_file) +``` + + +### R dump file + +Finally, it is also possible to use the R dump file format. This is *not* +recommended because CmdStan can process JSON faster than R dump, but CmdStanR +allows it because CmdStan will accept files created by `rstan::stan_rdump()`: + +```{r data-rdump, eval=FALSE} +rdump_file <- tempfile(fileext = ".data.R") +rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list)) +cat(readLines(rdump_file), sep = "\n") +fit <- mod$sample(data = rdump_file) +``` + + +## Writing CmdStan output to CSV + +### Default temporary files + +```{r sample-tempdir, results = "hide"} +data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +fit <- mod$sample(data = data_list) +``` + +When fitting a model, the default behavior is to write the output from CmdStan +to CSV files in a temporary directory. + +```{r output_files} +fit$output_files() +``` + +These files will be lost if you end your R session or if you remove the +`fit` object and force (or wait for) garbage collection. + + +```{r gc} +files <- fit$output_files() +file.exists(files) + +rm(fit) +gc() + +file.exists(files) +``` + +### Non-temporary files + +To save these files to a non-temporary location there are two options. You +can either specify the `output_dir` argument to `mod$sample()` or use +`fit$save_output_files()` after fitting the model. + +```{r save_output_files, eval=FALSE} +# see ?save_output_files for info on optional arguments +fit$save_output_files(dir = "path/to/directory") +``` + +```{r output_dir, eval = FALSE} +fit <- mod$sample( + data = data_list, + output_dir = "path/to/directory" +) +``` + + +## Reading CmdStan output into R + +### Lazy CSV reading + +With the exception of some diagnostic information, the CSV files are not read +into R until their contents are requested by calling a method that requires them +(e.g., `fit$draws()`, `fit$summary()`, etc.). If we examine the structure of the +`fit` object, notice how the `Private` slot `draws_` is `NULL`, indicating that +the CSV files haven't yet been read into R. + +```{r refit, include=FALSE} +fit <- mod$sample(data = data_list) +``` +```{r csv-not-read} +str(fit) +``` + +After we call a method that requires the draws then if we reexamine the +structure of the object we will see that the `draws_` slot in `Private` +is no longer empty. + +```{r for-csv-reading} +draws <- fit$draws() # force CSVs to be read into R +str(fit) +``` + +For models with many parameters, transformed parameters, or generated +quantities, if only some are requested (e.g., by specifying the `variables` +argument to `fit$draws()`) then CmdStanR will only read in the requested +variables (unless they have already been read in). + +### read_cmdstan_csv() + +Internally, the `read_cmdstan_csv()` function is used to read the CmdStan CSV +files into R. This function is exposed to users, so you can also call it +directly. + +```{r read_cmdstan_csv} +# see ?read_cmdstan_csv for info on optional arguments controlling +# what information is read in +csv_contents <- read_cmdstan_csv(fit$output_files()) +str(csv_contents) +``` + +### as_cmdstan_fit() + +If you need to manually create fitted model objects from CmdStan CSV files use +`as_cmdstan_fit()`. + +```{r as_cmdstan_fit} +fit2 <- as_cmdstan_fit(fit$output_files()) +``` + +This is pointless in our case since we have the original `fit` object, but this +function can be used to create fitted model objects (`CmdStanMCMC`, +`CmdStanMLE`, etc.) from any CmdStan CSV files. + +### Saving and accessing advanced algorithm info (latent dynamics) + +If `save_latent_dynamics` is set to `TRUE` when running the `$sample()` method +then additional CSV files are created (one per chain) that provide access to +quantities used under the hood by Stan's implementation of dynamic Hamiltonian +Monte Carlo. + +CmdStanR does not yet provide a special method for processing these files but +they can be read into R using R's standard CSV reading functions. + + +```{r save_latent_dynamics, results = "hide"} +fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE) +``` +```{r read-latent-dynamics} +fit$latent_dynamics_files() + +# read one of the files in +x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#") +head(x) +``` + +The column `lp__` is also provided via `fit$draws()`, and the columns +`accept_stat__`, `stepsize__`, `treedepth__`, `n_leapfrog__`, `divergent__`, and +`energy__` are also provided by `fit$sampler_diagnostics()`, but there are +several columns unique to the latent dynamics file. + +```{r explore-latent-dynamics} +head(x[, c("theta", "p_theta", "g_theta")]) +``` + +Our model has a single parameter `theta` and the three columns above correspond +to `theta` in the _unconstrained_ space (`theta` on the constrained space is +accessed via `fit$draws()`), the auxiliary momentum `p_theta`, and the gradient +`g_theta`. In general, each of these three columns will exist for _every_ +parameter in the model. + + +## Developing using CmdStanR + +CmdStanR can of course be used for developing other packages that require compiling +and running Stan models as well as using new or custom Stan features available +through CmdStan. + +### Pre-compiled Stan models in R packages + +You may compile a Stan model at runtime (e.g. just before sampling), +or you may compile all the models inside the package file system in advance at installation time. +The latter avoids compilations at runtime, which matters in centrally managed R installations +where users should not compile their own software. + +To pre-compile all the models in a package, +you may create top-level scripts `configure` and `configure.win` +which run `cmdstan_model()` with `compile = TRUE` and save the compiled executables +somewhere inside the `inst/` folder of the package source. +The [`instantiate`](https://wlandau.github.io/instantiate/) package helps developers +configure packages this way, +and it documents other topics such as submitting to CRAN and administering CmdStan. +Kevin Ushey's [`configure`](https://github.com/kevinushey/configure) package helps +create and manage package configuration files in general. + + +### Troubleshooting and debugging + +When developing or testing new features it might be useful to have more +information on how CmdStan is called internally and to see more information +printed when compiling or running models. This can be enabled for an entire R +session by setting the option `"cmdstanr_verbose"` to `TRUE`. + +```{r verbose-mode} +options("cmdstanr_verbose"=TRUE) + +mod <- cmdstan_model(stan_file, force_recompile = TRUE) +fit <- mod$sample( + data = data_list, + chains = 1, + iter_warmup = 100, + iter_sampling = 100 +) +``` + +```{r include=FALSE} +options("cmdstanr_verbose" = FALSE) +``` diff --git a/doc/cmdstanr-internals.html b/doc/cmdstanr-internals.html new file mode 100644 index 000000000..4d13fcd17 --- /dev/null +++ b/doc/cmdstanr-internals.html @@ -0,0 +1,783 @@ + + + + + + + + + + + + + + + +How does CmdStanR work? + + + + + + + + + + + + + + + + + + + + + + + + + + +

How does CmdStanR work?

+

Jonah Gabry and Rok Češnovar

+ + + + +
+

Introduction

+

This vignette is intended to be read after the Getting +started with CmdStanR vignette. Please read that first for +important background. In this document we provide additional details +about compiling models, passing in data, and how CmdStan output is saved +and read back into R.

+

We will only use the $sample() method in examples, but +all model fitting methods work in a similar way under the hood.

+
library(cmdstanr)
+check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)
+
+
+

Compilation

+
+

Immediate compilation

+

The cmdstan_model() function creates a new +CmdStanModel object. The CmdStanModel object +stores the path to a Stan program as well as the path to a compiled +executable.

+
stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan")
+mod <- cmdstan_model(stan_file)
+mod$print()
+mod$stan_file()
+mod$exe_file()
+

Subsequently, if you create a CmdStanModel object from +the same Stan file then compilation will be skipped (assuming the file +hasn’t changed).

+
mod <- cmdstan_model(stan_file)
+

Internally, cmdstan_model() first creates the +CmdStanModel object from just the Stan file and then calls +its $compile() +method. Optional arguments to the $compile() method can be +passed via ....

+
mod <- cmdstan_model(
+  stan_file,
+  force_recompile = TRUE,
+  include_paths = "paths/to/directories/with/included/files",
+  cpp_options = list(stan_threads = TRUE, STANC2 = TRUE)
+)
+
+
+

Delayed compilation

+

It is also possible to delay compilation when creating the +CmdStanModel object by specifying +compile=FALSE and then later calling the +$compile() method directly.

+
unlink(mod$exe_file())
+mod <- cmdstan_model(stan_file, compile = FALSE)
+mod$exe_file() # not yet created
+mod$compile()
+mod$exe_file()
+
+
+

Pedantic check

+

If you are using CmdStan version 2.24 or later and CmdStanR version +0.2.1 or later, you can run a pedantic check for your model. CmdStanR +will always check that your Stan program does not contain any invalid +syntax but with pedantic mode enabled the check will also warn you about +other potential issues in your model, for example:

+
    +
  • Distribution usages issues: distribution arguments do not match the +distribution specification, or some specific distribution is used in an +inadvisable way.
  • +
  • Unused parameter: a parameter is defined but does not contribute to +target.
  • +
  • Large or small constant in a distribution: very large or very small +constants are used as distribution arguments.
  • +
  • Control flow depends on a parameter: branching control flow (like +if/else) depends on a parameter value.
  • +
  • Parameter has multiple twiddles: a parameter is on the left-hand +side of multiple twiddles (i.e., multiple ~ symbols).
  • +
  • Parameter has zero or multiple priors: a parameter has zero or more +than one prior distribution.
  • +
  • Variable is used before assignment: a variable is used before being +assigned a value.
  • +
  • Strict or nonsensical parameter bounds: a parameter is given +questionable bounds.
  • +
+

For the latest information on the checks performed in pedantic mode +see the Pedantic +mode chapter in the Stan Reference Manual.

+

Pedantic mode is available when compiling the model or when using the +separate $check_syntax() method of a +CmdStanModel object. Internally this corresponds to setting +the stanc (Stan transpiler) option +warn-pedantic. Here we demonstrate pedantic mode with a +Stan program that is syntactically correct but is missing a lower bound +and a prior for a parameter.

+
stan_file_pedantic <- write_stan_file("
+data {
+  int N;
+  array[N] int y;
+}
+parameters {
+  // should have <lower=0> but omitting to demonstrate pedantic mode
+  real lambda;
+}
+model {
+  y ~ poisson(lambda);
+}
+")
+

To turn on pedantic mode at compile time you can set +pedantic=TRUE in the call to cmdstan_model() +(or when calling the $compile() method directly if using +the delayed compilation approach described above).

+
mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE)
+

To turn on pedantic mode separately from compilation use the +pedantic argument to the $check_syntax() +method.

+
mod_pedantic$check_syntax(pedantic = TRUE)
+

Using pedantic=TRUE via the $check_syntax() +method also has the advantage that it can be used even if the model +hasn’t been compiled yet. This can be helpful because the pedantic and +syntax checks themselves are much faster than compilation.

+
file.remove(mod_pedantic$exe_file()) # delete compiled executable
+rm(mod_pedantic)
+
+mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE)
+mod_pedantic$check_syntax(pedantic = TRUE)
+
+
+

Stan model variables

+

If using CmdStan 2.27 or newer, you can obtain the names, types and +dimensions of the data, parameters, transformed parameters and generated +quantities variables of a Stan model using the $variables() +method of the CmdStanModel object.

+
stan_file_variables <- write_stan_file("
+data {
+  int<lower=1> J;
+  vector<lower=0>[J] sigma;
+  vector[J] y;
+}
+parameters {
+  real mu;
+  real<lower=0> tau;
+  vector[J] theta_raw;
+}
+transformed parameters {
+  vector[J] theta = mu + tau * theta_raw;
+}
+model {
+  target += normal_lpdf(tau | 0, 10);
+  target += normal_lpdf(mu | 0, 10);
+  target += normal_lpdf(theta_raw | 0, 1);
+  target += normal_lpdf(y | theta, sigma);
+}
+")
+mod_v <- cmdstan_model(stan_file_variables)
+variables <- mod_v$variables()
+

The $variables() method returns a list with +data, parameters, +transformed_parameters and +generated_quantities elements, each corresponding to +variables in their respective block of the program. Transformed data +variables are not listed as they are not used in the model’s input or +output.

+
names(variables)
+names(variables$data)
+names(variables$parameters)
+names(variables$transformed_parameters)
+names(variables$generated_quantities)
+

Each variable is represented as a list containing the type +information (currently limited to real or int) +and the number of dimensions.

+
variables$data$J
+variables$data$sigma
+variables$parameters$tau
+variables$transformed_parameters$theta
+
+
+

Executable location

+

By default, the executable is created in the same directory as the +file containing the Stan program. You can also specify a different +location with the dir argument.

+
mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable")
+
+
+
+

Processing data

+

There are three data formats that CmdStanR allows when fitting a +model:

+
    +
  • named list of R objects
  • +
  • JSON file
  • +
  • R dump file
  • +
+
+

Named list of R objects

+

Like the RStan interface, CmdStanR accepts a named list of R objects +where the names correspond to variables declared in the data block of +the Stan program. In the Bernoulli model the data is N, the +number of data points, and y an integer array of +observations.

+
mod$print()
+
# data block has 'N' and 'y'
+data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
+fit <- mod$sample(data = data_list)
+

Because CmdStan doesn’t accept lists of R objects, CmdStanR will +first write the data to a temporary JSON file using +write_stan_json(). This happens internally, but it is also +possible to call write_stan_json() directly.

+
data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
+json_file <- tempfile(fileext = ".json")
+write_stan_json(data_list, json_file)
+cat(readLines(json_file), sep = "\n")
+
+
+

JSON file

+

If you already have your data in a JSON file you can just pass that +file directly to CmdStanR instead of using a list of R objects. For +example, we could pass in the JSON file we created above using +write_stan_json():

+
fit <- mod$sample(data = json_file)
+
+
+

R dump file

+

Finally, it is also possible to use the R dump file format. This is +not recommended because CmdStan can process JSON faster than R +dump, but CmdStanR allows it because CmdStan will accept files created +by rstan::stan_rdump():

+
rdump_file <- tempfile(fileext = ".data.R")
+rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list))
+cat(readLines(rdump_file), sep = "\n")
+fit <- mod$sample(data = rdump_file)
+
+
+
+

Writing CmdStan output to CSV

+
+

Default temporary files

+
data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
+fit <- mod$sample(data = data_list)
+

When fitting a model, the default behavior is to write the output +from CmdStan to CSV files in a temporary directory.

+
fit$output_files()
+

These files will be lost if you end your R session or if you remove +the fit object and force (or wait for) garbage +collection.

+
files <- fit$output_files()
+file.exists(files)
+
+rm(fit)
+gc()
+
+file.exists(files)
+
+
+

Non-temporary files

+

To save these files to a non-temporary location there are two +options. You can either specify the output_dir argument to +mod$sample() or use fit$save_output_files() +after fitting the model.

+
# see ?save_output_files for info on optional arguments
+fit$save_output_files(dir = "path/to/directory")
+
fit <- mod$sample(
+  data = data_list,
+  output_dir = "path/to/directory"
+)
+
+
+
+

Reading CmdStan output into R

+
+

Lazy CSV reading

+

With the exception of some diagnostic information, the CSV files are +not read into R until their contents are requested by calling a method +that requires them (e.g., fit$draws(), +fit$summary(), etc.). If we examine the structure of the +fit object, notice how the Private slot +draws_ is NULL, indicating that the CSV files +haven’t yet been read into R.

+
str(fit)
+

After we call a method that requires the draws then if we reexamine +the structure of the object we will see that the draws_ +slot in Private is no longer empty.

+
draws <- fit$draws() # force CSVs to be read into R
+str(fit)
+

For models with many parameters, transformed parameters, or generated +quantities, if only some are requested (e.g., by specifying the +variables argument to fit$draws()) then +CmdStanR will only read in the requested variables (unless they have +already been read in).

+
+
+

read_cmdstan_csv()

+

Internally, the read_cmdstan_csv() function is used to +read the CmdStan CSV files into R. This function is exposed to users, so +you can also call it directly.

+
# see ?read_cmdstan_csv for info on optional arguments controlling
+# what information is read in
+csv_contents <- read_cmdstan_csv(fit$output_files())
+str(csv_contents)
+
+
+

as_cmdstan_fit()

+

If you need to manually create fitted model objects from CmdStan CSV +files use as_cmdstan_fit().

+
fit2 <- as_cmdstan_fit(fit$output_files())
+

This is pointless in our case since we have the original +fit object, but this function can be used to create fitted +model objects (CmdStanMCMC, CmdStanMLE, etc.) +from any CmdStan CSV files.

+
+
+

Saving and accessing advanced algorithm info (latent dynamics)

+

If save_latent_dynamics is set to TRUE when +running the $sample() method then additional CSV files are +created (one per chain) that provide access to quantities used under the +hood by Stan’s implementation of dynamic Hamiltonian Monte Carlo.

+

CmdStanR does not yet provide a special method for processing these +files but they can be read into R using R’s standard CSV reading +functions.

+
fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE)
+
fit$latent_dynamics_files()
+
+# read one of the files in
+x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#")
+head(x)
+

The column lp__ is also provided via +fit$draws(), and the columns accept_stat__, +stepsize__, treedepth__, +n_leapfrog__, divergent__, and +energy__ are also provided by +fit$sampler_diagnostics(), but there are several columns +unique to the latent dynamics file.

+
head(x[, c("theta", "p_theta", "g_theta")])
+

Our model has a single parameter theta and the three +columns above correspond to theta in the +unconstrained space (theta on the constrained +space is accessed via fit$draws()), the auxiliary momentum +p_theta, and the gradient g_theta. In general, +each of these three columns will exist for every parameter in +the model.

+
+
+
+

Developing using CmdStanR

+

CmdStanR can of course be used for developing other packages that +require compiling and running Stan models as well as using new or custom +Stan features available through CmdStan.

+
+

Pre-compiled Stan models in R packages

+

You may compile a Stan model at runtime (e.g. just before sampling), +or you may compile all the models inside the package file system in +advance at installation time. The latter avoids compilations at runtime, +which matters in centrally managed R installations where users should +not compile their own software.

+

To pre-compile all the models in a package, you may create top-level +scripts configure and configure.win which run +cmdstan_model() with compile = TRUE and save +the compiled executables somewhere inside the inst/ folder +of the package source. The instantiate +package helps developers configure packages this way, and it documents +other topics such as submitting to CRAN and administering CmdStan. Kevin +Ushey’s configure +package helps create and manage package configuration files in +general.

+
+
+

Troubleshooting and debugging

+

When developing or testing new features it might be useful to have +more information on how CmdStan is called internally and to see more +information printed when compiling or running models. This can be +enabled for an entire R session by setting the option +"cmdstanr_verbose" to TRUE.

+
options("cmdstanr_verbose"=TRUE)
+
+mod <- cmdstan_model(stan_file, force_recompile = TRUE)
+fit <- mod$sample(
+  data = data_list,
+  chains = 1,
+  iter_warmup = 100,
+  iter_sampling = 100
+)
+
+
+ + + + + + + + + + + diff --git a/doc/cmdstanr.R b/doc/cmdstanr.R new file mode 100644 index 000000000..ce0e942f8 --- /dev/null +++ b/doc/cmdstanr.R @@ -0,0 +1,230 @@ +params <- +list(EVAL = FALSE) + +## ----settings-knitr, include=FALSE-------------------------------------------- +stopifnot(require(knitr)) +opts_chunk$set( + # collapse = TRUE, + dev = "png", + dpi = 150, + fig.asp = 0.618, + fig.width = 5, + out.width = "60%", + fig.align = "center", + comment = NA, + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +## ----install, eval=FALSE------------------------------------------------------ +# # we recommend running this is a fresh R session or restarting your current session +# install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) + +## ----library, message=FALSE--------------------------------------------------- +# library(cmdstanr) +# library(posterior) +# library(bayesplot) +# color_scheme_set("brightblue") + +## ----check-toolchain---------------------------------------------------------- +# check_cmdstan_toolchain() + +## ----install_cmdstan-1, include = FALSE--------------------------------------- +# if (!dir.exists(cmdstan_default_path())) { +# install_cmdstan() +# } + +## ----install_cmdstan-2, eval=FALSE-------------------------------------------- +# install_cmdstan(cores = 2) + +## ----set_cmdstan_path, eval=FALSE--------------------------------------------- +# set_cmdstan_path(PATH_TO_CMDSTAN) + +## ----cmdstan_path------------------------------------------------------------- +# cmdstan_path() +# cmdstan_version() + +## ----cmdstan_model------------------------------------------------------------ +# file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") +# mod <- cmdstan_model(file) + +## ----compile------------------------------------------------------------------ +# mod$print() + +## ----exe_file----------------------------------------------------------------- +# mod$exe_file() + +## ----sample------------------------------------------------------------------- +# # names correspond to the data block in the Stan program +# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) +# +# fit <- mod$sample( +# data = data_list, +# seed = 123, +# chains = 4, +# parallel_chains = 4, +# refresh = 500 # print update every 500 iters +# ) + +## ----summary, eval=FALSE------------------------------------------------------ +# fit$summary() +# fit$summary(variables = c("theta", "lp__"), "mean", "sd") +# +# # use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5) +# fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5)) +# +# # summarise all variables with default and additional summary measures +# fit$summary( +# variables = NULL, +# posterior::default_summary_measures(), +# extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) +# ) + +## ---- echo=FALSE-------------------------------------------------------------- +# # NOTE: the hack of using print.data.frame in chunks with echo=FALSE +# # is used because the pillar formatting of posterior draws_summary objects +# # isn't playing nicely with pkgdown::build_articles(). +# options(digits = 2) +# print.data.frame(fit$summary()) +# print.data.frame(fit$summary(variables = c("theta", "lp__"), "mean", "sd")) +# print.data.frame(fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))) +# print.data.frame(fit$summary( +# variables = NULL, +# posterior::default_summary_measures(), +# extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) +# )) + +## ----draws, message=FALSE----------------------------------------------------- +# # default is a 3-D draws_array object from the posterior package +# # iterations x chains x variables +# draws_arr <- fit$draws() # or format="array" +# str(draws_arr) +# +# # draws x variables data frame +# draws_df <- fit$draws(format = "df") +# str(draws_df) +# print(draws_df) + +## ----as_draws----------------------------------------------------------------- +# # this should be identical to draws_df created via draws(format = "df") +# draws_df_2 <- as_draws_df(draws_arr) +# identical(draws_df, draws_df_2) + +## ----plots, message=FALSE----------------------------------------------------- +# mcmc_hist(fit$draws("theta")) + +## ----sampler_diagnostics------------------------------------------------------ +# # this is a draws_array object from the posterior package +# str(fit$sampler_diagnostics()) +# +# # this is a draws_df object from the posterior package +# str(fit$sampler_diagnostics(format = "df")) + +## ----diagnostic_summary------------------------------------------------------- +# fit$diagnostic_summary() + +## ----fit-with-warnings, results='hold'---------------------------------------- +# fit_with_warning <- cmdstanr_example("schools") + +## ----diagnostic_summary-with-warnings----------------------------------------- +# diagnostics <- fit_with_warning$diagnostic_summary() +# print(diagnostics) +# +# # number of divergences reported in warning is the sum of the per chain values +# sum(diagnostics$num_divergent) + +## ----stanfit, eval=FALSE------------------------------------------------------ +# stanfit <- rstan::read_stan_csv(fit$output_files()) + +## ----optimize----------------------------------------------------------------- +# fit_mle <- mod$optimize(data = data_list, seed = 123) +# fit_mle$print() # includes lp__ (log prob calculated by Stan program) +# fit_mle$mle("theta") + +## ----plot-mle, message = FALSE------------------------------------------------ +# mcmc_hist(fit$draws("theta")) + +# vline_at(fit_mle$mle("theta"), size = 1.5) + +## ----optimize-map------------------------------------------------------------- +# fit_map <- mod$optimize( +# data = data_list, +# jacobian = TRUE, +# seed = 123 +# ) + +## ----laplace------------------------------------------------------------------ +# fit_laplace <- mod$laplace( +# mode = fit_map, +# draws = 4000, +# data = data_list, +# seed = 123, +# refresh = 1000 +# ) +# fit_laplace$print("theta") +# mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) + +## ----variational-------------------------------------------------------------- +# fit_vb <- mod$variational( +# data = data_list, +# seed = 123, +# draws = 4000 +# ) +# fit_vb$print("theta") +# mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) + +## ----pathfinder--------------------------------------------------------------- +# fit_pf <- mod$pathfinder( +# data = data_list, +# seed = 123, +# draws = 4000 +# ) +# fit_pf$print("theta") + +## ----plot-compare-pf, message = FALSE----------------------------------------- +# mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) + +# ggplot2::labs(subtitle = "Approximate posterior from pathfinder") + +# ggplot2::xlim(0, 1) + +## ----plot-compare-vb, message = FALSE----------------------------------------- +# mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) + +# ggplot2::labs(subtitle = "Approximate posterior from variational") + +# ggplot2::xlim(0, 1) + +## ----plot-compare-laplace, message = FALSE------------------------------------ +# mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) + +# ggplot2::labs(subtitle = "Approximate posterior from Laplace") + +# ggplot2::xlim(0, 1) + +## ----plot-compare-mcmc, message = FALSE--------------------------------------- +# mcmc_hist(fit$draws("theta"), binwidth = 0.025) + +# ggplot2::labs(subtitle = "Posterior from MCMC") + +# ggplot2::xlim(0, 1) + +## ----save_object, eval=FALSE-------------------------------------------------- +# fit$save_object(file = "fit.RDS") +# +# # can be read back in using readRDS +# fit2 <- readRDS("fit.RDS") + +## ----save_object_qs_full, eval = FALSE---------------------------------------- +# # Load CmdStan output files into the fitted model object. +# fit$draws() # Load posterior draws into the object. +# try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics. +# try(fit$init(), silent = TRUE) # Load user-defined initial values. +# try(fit$profiles(), silent = TRUE) # Load profiling samples. +# +# # Save the object to a file. +# qs::qsave(x = fit, file = "fit.qs") +# +# # Read the object. +# fit2 <- qs::qread("fit.qs") + +## ----save_object_qs_small, eval = FALSE--------------------------------------- +# # Load posterior draws into the fitted model object and omit other output. +# fit$draws() +# +# # Save the object to a file. +# qs::qsave(x = fit, file = "fit.qs") +# +# # Read the object. +# fit2 <- qs::qread("fit.qs") + diff --git a/doc/cmdstanr.Rmd b/doc/cmdstanr.Rmd new file mode 100644 index 000000000..b9b2e3414 --- /dev/null +++ b/doc/cmdstanr.Rmd @@ -0,0 +1,554 @@ +--- +title: "Getting started with CmdStanR" +author: "Jonah Gabry, Rok Češnovar, and Andrew Johnson" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +params: + EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") +vignette: > + %\VignetteIndexEntry{Getting started with CmdStanR} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r child="children/_settings-knitr.Rmd"} +``` + +## Introduction + +CmdStanR (Command Stan R) is a lightweight interface to +[Stan](https://mc-stan.org/) for R users that provides an alternative to the +traditional [RStan](https://mc-stan.org/rstan/) interface. See the [*Comparison +with RStan*](#comparison-with-rstan) section later in this vignette for more +details on how the two interfaces differ. + +Using CmdStanR requires installing the **cmdstanr** R package and also +CmdStan, the command line interface to Stan. First we install the R package +by running the following command in R. + +```{r install, eval=FALSE} +# we recommend running this is a fresh R session or restarting your current session +install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) +``` + +We can now load the package like any other R package. We'll also load the +**bayesplot** and **posterior** packages to use later in examples. + +```{r library, message=FALSE} +library(cmdstanr) +library(posterior) +library(bayesplot) +color_scheme_set("brightblue") +``` + +## Installing CmdStan + +CmdStanR requires a working installation of +[CmdStan](https://mc-stan.org/users/interfaces/cmdstan.html), the shell +interface to Stan. If you don't have CmdStan installed then CmdStanR can install +it for you, assuming you have a suitable C++ toolchain. The requirements are +described in the CmdStan Guide: + +* https://mc-stan.org/docs/cmdstan-guide/cmdstan-installation.html + +To double check that your toolchain is set up properly you can call +the `check_cmdstan_toolchain()` function: + +```{r check-toolchain} +check_cmdstan_toolchain() +``` + +If your toolchain is configured correctly then CmdStan can be installed by +calling the +[`install_cmdstan()`](https://mc-stan.org/cmdstanr/reference/install_cmdstan.html) +function: + +```{r install_cmdstan-1, include = FALSE} +if (!dir.exists(cmdstan_default_path())) { + install_cmdstan() +} +``` +```{r install_cmdstan-2, eval=FALSE} +install_cmdstan(cores = 2) +``` + +Before CmdStanR can be used it needs to know where the CmdStan installation is +located. When the package is loaded it tries to help automate this to avoid +having to manually set the path every session: + +1. If the environment variable `"CMDSTAN"` exists at load time then its value +will be automatically set as the default path to CmdStan for the R session. This +is useful if your CmdStan installation is not located in the default directory +that would have been used by `install_cmdstan()` (see #2). + +2. If no environment variable is found when loaded but any directory in the form +`".cmdstan/cmdstan-[version]"`, for example `".cmdstan/cmdstan-2.23.0"`, +exists in the user's home directory (`Sys.getenv("HOME")`, +*not* the current working directory) then the path to the CmdStan with the +largest version number will be set as the path to CmdStan for the R session. +This is the same as the default directory that `install_cmdstan()` uses to +install the latest version of CmdStan, so if that's how you installed CmdStan +you shouldn't need to manually set the path to CmdStan when loading CmdStanR. + +If neither of these applies (or you want to subsequently change the path) you +can use the `set_cmdstan_path()` function: + +```{r set_cmdstan_path, eval=FALSE} +set_cmdstan_path(PATH_TO_CMDSTAN) +``` + +To check the path to the CmdStan installation and the CmdStan version number +you can use `cmdstan_path()` and `cmdstan_version()`: + +```{r cmdstan_path} +cmdstan_path() +cmdstan_version() +``` + +## Compiling a model + +The `cmdstan_model()` function creates a new +[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) +object from a file containing a Stan program. Under the hood, CmdStan is called +to translate a Stan program to C++ and create a compiled executable. Here we'll +use the example Stan program that comes with the CmdStan installation: + +```{r cmdstan_model} +file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") +mod <- cmdstan_model(file) +``` + +The object `mod` is an [R6](https://r6.r-lib.org/) reference object of class +[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) and +behaves similarly to R's reference class objects and those in object oriented +programming languages. Methods are accessed using the `$` operator. This design +choice allows for CmdStanR and +[CmdStanPy](https://github.com/stan-dev/cmdstanpy) to provide a similar user +experience and share many implementation details. + +The Stan program can be printed using the `$print()` method: + +```{r compile} +mod$print() +``` + +The path to the compiled executable is returned by the `$exe_file()` +method: + +```{r exe_file} +mod$exe_file() +``` + +## Running MCMC + +The +[`$sample()`](https://mc-stan.org/cmdstanr/reference/model-method-sample.html) +method for +[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) +objects runs Stan's default MCMC algorithm. The `data` argument accepts a named +list of R objects (like for RStan) or a path to a data file compatible with +CmdStan (JSON or R dump). + +```{r sample} +# names correspond to the data block in the Stan program +data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) + +fit <- mod$sample( + data = data_list, + seed = 123, + chains = 4, + parallel_chains = 4, + refresh = 500 # print update every 500 iters +) +``` + +There are many more arguments that can be passed to the `$sample()` method. +For details follow this link to its separate documentation page: + +* [`$sample()`](https://mc-stan.org/cmdstanr/reference/model-method-sample.html) + +The `$sample()` method creates [R6](https://r6.r-lib.org/) `CmdStanMCMC` +objects, which have many associated methods. Below we will demonstrate some of +the most important methods. For a full list, follow this link to the +`CmdStanMCMC` documentation: + +* [`CmdStanMCMC`](https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html) + +### Posterior summary statistics + +#### Summaries from the posterior package + +The +[`$summary()`](https://mc-stan.org/cmdstanr/reference/fit-method-summary.html) +method calls `summarise_draws()` from the **posterior** package. The +first argument specifies the variables to summarize and any arguments +after that are passed on to `posterior::summarise_draws()` to specify +which summaries to compute, whether to use multiple cores, etc. + +```{r summary, eval=FALSE} +fit$summary() +fit$summary(variables = c("theta", "lp__"), "mean", "sd") + +# use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5) +fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5)) + +# summarise all variables with default and additional summary measures +fit$summary( + variables = NULL, + posterior::default_summary_measures(), + extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) +) +``` + +```{r, echo=FALSE} + # NOTE: the hack of using print.data.frame in chunks with echo=FALSE + # is used because the pillar formatting of posterior draws_summary objects + # isn't playing nicely with pkgdown::build_articles(). + options(digits = 2) + print.data.frame(fit$summary()) + print.data.frame(fit$summary(variables = c("theta", "lp__"), "mean", "sd")) + print.data.frame(fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))) + print.data.frame(fit$summary( + variables = NULL, + posterior::default_summary_measures(), + extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) + )) +``` + +#### CmdStan's stansummary utility + +CmdStan itself provides a `stansummary` utility that can be called using the +`$cmdstan_summary()` method. This method will print summaries but won't return +anything. + +### Posterior draws + +#### Extracting draws + +The [`$draws()`](https://mc-stan.org/cmdstanr/reference/fit-method-draws.html) +method can be used to extract the posterior draws in formats provided by the +[**posterior**](https://mc-stan.org/posterior/) package. Here we demonstrate +only the `draws_array` and `draws_df` formats, but the **posterior** package +supports other useful formats as well. + +```{r draws, message=FALSE} +# default is a 3-D draws_array object from the posterior package +# iterations x chains x variables +draws_arr <- fit$draws() # or format="array" +str(draws_arr) + +# draws x variables data frame +draws_df <- fit$draws(format = "df") +str(draws_df) +print(draws_df) +``` + +To convert an existing draws object to a different format use the +`posterior::as_draws_*()` functions. + +```{r as_draws} +# this should be identical to draws_df created via draws(format = "df") +draws_df_2 <- as_draws_df(draws_arr) +identical(draws_df, draws_df_2) +``` + +In general, converting to a different draws format in this way will be slower +than just setting the appropriate format initially in the call to the `$draws()` +method, but in most cases the speed difference will be minor. + +#### Plotting draws + +Plotting posterior distributions is as easy as passing the object returned by +the `$draws()` method directly to plotting functions in our +[**bayesplot**](https://mc-stan.org/bayesplot/) package. + +```{r plots, message=FALSE} +mcmc_hist(fit$draws("theta")) +``` + +### Sampler diagnostics + +#### Extracting diagnostic values for each iteration and chain + +The +[`$sampler_diagnostics()`](https://mc-stan.org/cmdstanr/reference/fit-method-sampler_diagnostics.html) +method extracts the values of the sampler parameters (`treedepth__`, +`divergent__`, etc.) in formats supported by the **posterior** package. The +default is as a 3-D array (iteration x chain x variable). + +```{r sampler_diagnostics} +# this is a draws_array object from the posterior package +str(fit$sampler_diagnostics()) + +# this is a draws_df object from the posterior package +str(fit$sampler_diagnostics(format = "df")) +``` + +#### Sampler diagnostic warnings and summaries + +The `$diagnostic_summary()` method will display any sampler diagnostic warnings and return a summary of diagnostics for each chain. + +```{r diagnostic_summary} +fit$diagnostic_summary() +``` + +We see the number of divergences for each of the four chains, the number +of times the maximum treedepth was hit for each chain, and the E-BFMI +for each chain. + +In this case there were no warnings, so in order to demonstrate the warning +messages we'll use one of the CmdStanR example models that suffers from +divergences. + +```{r fit-with-warnings, results='hold'} +fit_with_warning <- cmdstanr_example("schools") +``` +After fitting there is a warning about divergences. We can also regenerate this warning message later using `fit$diagnostic_summary()`. + +```{r diagnostic_summary-with-warnings} +diagnostics <- fit_with_warning$diagnostic_summary() +print(diagnostics) + +# number of divergences reported in warning is the sum of the per chain values +sum(diagnostics$num_divergent) +``` + +#### CmdStan's diagnose utility + +CmdStan itself provides a `diagnose` utility that can be called using +the `$cmdstan_diagnose()` method. This method will print warnings but won't return anything. + + +### Create a `stanfit` object + +If you have RStan installed then it is also possible to create a `stanfit` +object from the csv output files written by CmdStan. This can be done by using +`rstan::read_stan_csv()` in combination with the `$output_files()` method of the +`CmdStanMCMC` object. This is only needed if you want to fit a model with +CmdStanR but already have a lot of post-processing code that assumes a `stanfit` +object. Otherwise we recommend using the post-processing functionality provided +by CmdStanR itself. + +```{r stanfit, eval=FALSE} +stanfit <- rstan::read_stan_csv(fit$output_files()) +``` + + + +## Running optimization and variational inference + +CmdStanR also supports running Stan's optimization algorithms and its algorithms +for variational approximation of full Bayesian inference. These are run via the +`$optimize()`, `$laplace()`, `$variational()`, and `$pathfinder()` methods, which +are called in a similar way to the `$sample()` method demonstrated above. + +### Optimization + +We can find the (penalized) maximum likelihood estimate (MLE) using [`$optimize()`](https://mc-stan.org/cmdstanr/reference/model-method-optimize.html). + +```{r optimize} +fit_mle <- mod$optimize(data = data_list, seed = 123) +fit_mle$print() # includes lp__ (log prob calculated by Stan program) +fit_mle$mle("theta") +``` + +Here's a plot comparing the penalized MLE to the posterior distribution of +`theta`. + +```{r plot-mle, message = FALSE} +mcmc_hist(fit$draws("theta")) + + vline_at(fit_mle$mle("theta"), size = 1.5) +``` + +For optimization, by default the mode is calculated without the Jacobian +adjustment for constrained variables, which shifts the mode due to the change of +variables. To include the Jacobian adjustment and obtain a maximum a posteriori +(MAP) estimate set `jacobian=TRUE`. See the +[Maximum Likelihood Estimation](https://mc-stan.org/docs/cmdstan-guide/maximum-likelihood-estimation.html) +section of the CmdStan User's Guide for more details. + +```{r optimize-map} +fit_map <- mod$optimize( + data = data_list, + jacobian = TRUE, + seed = 123 +) +``` + +### Laplace Approximation + +The [`$laplace()`](https://mc-stan.org/cmdstanr/reference/model-method-laplace.html) +method produces a sample from a normal approximation centered at the mode of a +distribution in the unconstrained space. If the mode is a MAP estimate, the +samples provide an estimate of the mean and standard deviation of the posterior +distribution. If the mode is the MLE, the sample provides an estimate of the +standard error of the likelihood. Whether the mode is the MAP or MLE depends on +the value of the `jacobian` argument when running optimization. See the +[Laplace Sampling](https://mc-stan.org/docs/cmdstan-guide/laplace-sampling.html) +chapter of the CmdStan User's Guide for more details. + +Here we pass in the `fit_map` object from above as the `mode` argument. If +`mode` is omitted then optimization will be run internally before taking draws +from the normal approximation. + +```{r laplace} +fit_laplace <- mod$laplace( + mode = fit_map, + draws = 4000, + data = data_list, + seed = 123, + refresh = 1000 + ) +fit_laplace$print("theta") +mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) +``` + +### Variational (ADVI) + +We can run Stan's experimental Automatic Differentiation Variational Inference +(ADVI) using the [`$variational()`](https://mc-stan.org/cmdstanr/reference/model-method-variational.html) +method. For details on the ADVI algorithm see the +[CmdStan User's Guide](https://mc-stan.org/docs/cmdstan-guide/variational-inference-algorithm-advi.html). + + +```{r variational} +fit_vb <- mod$variational( + data = data_list, + seed = 123, + draws = 4000 +) +fit_vb$print("theta") +mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) +``` + +### Variational (Pathfinder) + +Stan version 2.33 introduced a new variational method called Pathfinder, +which is intended to be faster and more stable than ADVI. For details on how +Pathfinder works see the section in the +[CmdStan User's Guide](https://mc-stan.org/docs/cmdstan-guide/pathfinder-intro.html#pathfinder-intro). +Pathfinder is run using the [`$pathfinder()`](https://mc-stan.org/cmdstanr/reference/model-method-pathfinder.html) +method. + +```{r pathfinder} +fit_pf <- mod$pathfinder( + data = data_list, + seed = 123, + draws = 4000 +) +fit_pf$print("theta") +``` + + +Let's extract the draws, make the same plot we made after running the other +algorithms, and compare them all. approximation, and compare them all. In this +simple example the distributions are quite similar, but this will not always be +the case for more challenging problems. + +```{r plot-compare-pf, message = FALSE} +mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) + + ggplot2::labs(subtitle = "Approximate posterior from pathfinder") + + ggplot2::xlim(0, 1) +``` +```{r plot-compare-vb, message = FALSE} +mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) + + ggplot2::labs(subtitle = "Approximate posterior from variational") + + ggplot2::xlim(0, 1) +``` +```{r plot-compare-laplace, message = FALSE} +mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) + + ggplot2::labs(subtitle = "Approximate posterior from Laplace") + + ggplot2::xlim(0, 1) +``` +```{r plot-compare-mcmc, message = FALSE} +mcmc_hist(fit$draws("theta"), binwidth = 0.025) + + ggplot2::labs(subtitle = "Posterior from MCMC") + + ggplot2::xlim(0, 1) +``` + +For more details on the `$optimize()`, `$laplace()`, `$variational()`, and +`pathfinder()` methods, follow these links to their documentation pages. + +* [`$optimize()`](https://mc-stan.org/cmdstanr/reference/model-method-optimize.html) +* [`$laplace()`](https://mc-stan.org/cmdstanr/reference/model-method-laplace.html) +* [`$variational()`](https://mc-stan.org/cmdstanr/reference/model-method-variational.html) +* [`$pathfinder()`](https://mc-stan.org/cmdstanr/reference/model-method-pathfinder.html) + + +## Saving fitted model objects + +The [`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) +method provided by CmdStanR is the most convenient way to save a fitted model object +to disk and ensure that all of the contents are available when reading the object back into R. + +```{r save_object, eval=FALSE} +fit$save_object(file = "fit.RDS") + +# can be read back in using readRDS +fit2 <- readRDS("fit.RDS") +``` + +But if your model object is large, then +[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) +could take a long time. +[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) +reads the CmdStan results files into memory, stores them in the model object, +and saves the object with `saveRDS()`. To speed up the process, you can emulate +[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) +and replace `saveRDS` with the much faster `qsave()` function from the +[`qs`](https://github.com/traversc/qs) package. + +```{r save_object_qs_full, eval = FALSE} +# Load CmdStan output files into the fitted model object. +fit$draws() # Load posterior draws into the object. +try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics. +try(fit$init(), silent = TRUE) # Load user-defined initial values. +try(fit$profiles(), silent = TRUE) # Load profiling samples. + +# Save the object to a file. +qs::qsave(x = fit, file = "fit.qs") + +# Read the object. +fit2 <- qs::qread("fit.qs") +``` + +Storage is even faster if you discard results you do not need to save. +The following example saves only posterior draws and discards +sampler diagnostics, user-specified initial values, and profiling data. + +```{r save_object_qs_small, eval = FALSE} +# Load posterior draws into the fitted model object and omit other output. +fit$draws() + +# Save the object to a file. +qs::qsave(x = fit, file = "fit.qs") + +# Read the object. +fit2 <- qs::qread("fit.qs") +``` + +See the vignette [_How does CmdStanR work?_](http://mc-stan.org/cmdstanr/articles/cmdstanr-internals.html) +for more information about the composition of CmdStanR objects. + +## Comparison with RStan + +```{r child="children/comparison-with-rstan.md"} +``` + +## Additional resources + +There are additional vignettes available that discuss other aspects of using +CmdStanR. These can be found online at the CmdStanR website: + +* https://mc-stan.org/cmdstanr/articles/index.html + +To ask a question please post on the Stan forums: + +* https://discourse.mc-stan.org/ + +To report a bug, suggest a feature (including additions to these vignettes), or to start contributing to CmdStanR +development (new contributors welcome!) please open an issue on GitHub: + +* https://github.com/stan-dev/cmdstanr/issues diff --git a/doc/cmdstanr.html b/doc/cmdstanr.html new file mode 100644 index 000000000..95a23794d --- /dev/null +++ b/doc/cmdstanr.html @@ -0,0 +1,814 @@ + + + + + + + + + + + + + + + +Getting started with CmdStanR + + + + + + + + + + + + + + + + + + + + + + + + + + +

Getting started with CmdStanR

+

Jonah Gabry, Rok Češnovar, and Andrew Johnson

+ + + + +
+

Introduction

+

CmdStanR (Command Stan R) is a lightweight interface to Stan for R users that provides an +alternative to the traditional RStan interface. See the Comparison with RStan section +later in this vignette for more details on how the two interfaces +differ.

+

Using CmdStanR requires installing the cmdstanr R +package and also CmdStan, the command line interface to Stan. First we +install the R package by running the following command in R.

+
# we recommend running this is a fresh R session or restarting your current session
+install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos")))
+

We can now load the package like any other R package. We’ll also load +the bayesplot and posterior packages +to use later in examples.

+
library(cmdstanr)
+library(posterior)
+library(bayesplot)
+color_scheme_set("brightblue")
+
+
+

Installing CmdStan

+

CmdStanR requires a working installation of CmdStan, +the shell interface to Stan. If you don’t have CmdStan installed then +CmdStanR can install it for you, assuming you have a suitable C++ +toolchain. The requirements are described in the CmdStan Guide:

+ +

To double check that your toolchain is set up properly you can call +the check_cmdstan_toolchain() function:

+
check_cmdstan_toolchain()
+

If your toolchain is configured correctly then CmdStan can be +installed by calling the install_cmdstan() +function:

+
install_cmdstan(cores = 2)
+

Before CmdStanR can be used it needs to know where the CmdStan +installation is located. When the package is loaded it tries to help +automate this to avoid having to manually set the path every +session:

+
    +
  1. If the environment variable "CMDSTAN" exists at load +time then its value will be automatically set as the default path to +CmdStan for the R session. This is useful if your CmdStan installation +is not located in the default directory that would have been used by +install_cmdstan() (see #2).

  2. +
  3. If no environment variable is found when loaded but any directory +in the form ".cmdstan/cmdstan-[version]", for example +".cmdstan/cmdstan-2.23.0", exists in the user’s home +directory (Sys.getenv("HOME"), not the current +working directory) then the path to the CmdStan with the largest version +number will be set as the path to CmdStan for the R session. This is the +same as the default directory that install_cmdstan() uses +to install the latest version of CmdStan, so if that’s how you installed +CmdStan you shouldn’t need to manually set the path to CmdStan when +loading CmdStanR.

  4. +
+

If neither of these applies (or you want to subsequently change the +path) you can use the set_cmdstan_path() function:

+
set_cmdstan_path(PATH_TO_CMDSTAN)
+

To check the path to the CmdStan installation and the CmdStan version +number you can use cmdstan_path() and +cmdstan_version():

+
cmdstan_path()
+cmdstan_version()
+
+
+

Compiling a model

+

The cmdstan_model() function creates a new CmdStanModel +object from a file containing a Stan program. Under the hood, CmdStan is +called to translate a Stan program to C++ and create a compiled +executable. Here we’ll use the example Stan program that comes with the +CmdStan installation:

+
file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan")
+mod <- cmdstan_model(file)
+

The object mod is an R6 reference object of class CmdStanModel +and behaves similarly to R’s reference class objects and those in object +oriented programming languages. Methods are accessed using the +$ operator. This design choice allows for CmdStanR and CmdStanPy to provide a +similar user experience and share many implementation details.

+

The Stan program can be printed using the $print() +method:

+
mod$print()
+

The path to the compiled executable is returned by the +$exe_file() method:

+
mod$exe_file()
+
+
+

Running MCMC

+

The $sample() +method for CmdStanModel +objects runs Stan’s default MCMC algorithm. The data +argument accepts a named list of R objects (like for RStan) or a path to +a data file compatible with CmdStan (JSON or R dump).

+
# names correspond to the data block in the Stan program
+data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
+
+fit <- mod$sample(
+  data = data_list,
+  seed = 123,
+  chains = 4,
+  parallel_chains = 4,
+  refresh = 500 # print update every 500 iters
+)
+

There are many more arguments that can be passed to the +$sample() method. For details follow this link to its +separate documentation page:

+ +

The $sample() method creates R6 CmdStanMCMC objects, +which have many associated methods. Below we will demonstrate some of +the most important methods. For a full list, follow this link to the +CmdStanMCMC documentation:

+ +
+

Posterior summary statistics

+
+

Summaries from the posterior package

+

The $summary() +method calls summarise_draws() from the +posterior package. The first argument specifies the +variables to summarize and any arguments after that are passed on to +posterior::summarise_draws() to specify which summaries to +compute, whether to use multiple cores, etc.

+
fit$summary()
+fit$summary(variables = c("theta", "lp__"), "mean", "sd")
+
+# use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5)
+fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))
+
+# summarise all variables with default and additional summary measures
+fit$summary(
+  variables = NULL,
+  posterior::default_summary_measures(),
+  extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975))
+)
+
+
+

CmdStan’s stansummary utility

+

CmdStan itself provides a stansummary utility that can +be called using the $cmdstan_summary() method. This method +will print summaries but won’t return anything.

+
+
+
+

Posterior draws

+
+

Extracting draws

+

The $draws() +method can be used to extract the posterior draws in formats provided by +the posterior +package. Here we demonstrate only the draws_array and +draws_df formats, but the posterior +package supports other useful formats as well.

+
# default is a 3-D draws_array object from the posterior package
+# iterations x chains x variables
+draws_arr <- fit$draws() # or format="array"
+str(draws_arr)
+
+# draws x variables data frame
+draws_df <- fit$draws(format = "df")
+str(draws_df)
+print(draws_df)
+

To convert an existing draws object to a different format use the +posterior::as_draws_*() functions.

+
# this should be identical to draws_df created via draws(format = "df")
+draws_df_2 <- as_draws_df(draws_arr)
+identical(draws_df, draws_df_2)
+

In general, converting to a different draws format in this way will +be slower than just setting the appropriate format initially in the call +to the $draws() method, but in most cases the speed +difference will be minor.

+
+
+

Plotting draws

+

Plotting posterior distributions is as easy as passing the object +returned by the $draws() method directly to plotting +functions in our bayesplot +package.

+
mcmc_hist(fit$draws("theta"))
+
+
+
+

Sampler diagnostics

+
+

Extracting diagnostic values for each iteration and chain

+

The $sampler_diagnostics() +method extracts the values of the sampler parameters +(treedepth__, divergent__, etc.) in formats +supported by the posterior package. The default is as a +3-D array (iteration x chain x variable).

+
# this is a draws_array object from the posterior package
+str(fit$sampler_diagnostics())
+
+# this is a draws_df object from the posterior package
+str(fit$sampler_diagnostics(format = "df"))
+
+
+

Sampler diagnostic warnings and summaries

+

The $diagnostic_summary() method will display any +sampler diagnostic warnings and return a summary of diagnostics for each +chain.

+
fit$diagnostic_summary()
+

We see the number of divergences for each of the four chains, the +number of times the maximum treedepth was hit for each chain, and the +E-BFMI for each chain.

+

In this case there were no warnings, so in order to demonstrate the +warning messages we’ll use one of the CmdStanR example models that +suffers from divergences.

+
fit_with_warning <- cmdstanr_example("schools")
+

After fitting there is a warning about divergences. We can also +regenerate this warning message later using +fit$diagnostic_summary().

+
diagnostics <- fit_with_warning$diagnostic_summary()
+print(diagnostics)
+
+# number of divergences reported in warning is the sum of the per chain values
+sum(diagnostics$num_divergent)
+
+
+

CmdStan’s diagnose utility

+

CmdStan itself provides a diagnose utility that can be +called using the $cmdstan_diagnose() method. This method +will print warnings but won’t return anything.

+
+
+
+

Create a stanfit object

+

If you have RStan installed then it is also possible to create a +stanfit object from the csv output files written by +CmdStan. This can be done by using rstan::read_stan_csv() +in combination with the $output_files() method of the +CmdStanMCMC object. This is only needed if you want to fit +a model with CmdStanR but already have a lot of post-processing code +that assumes a stanfit object. Otherwise we recommend using +the post-processing functionality provided by CmdStanR itself.

+
stanfit <- rstan::read_stan_csv(fit$output_files())
+
+
+
+

Running optimization and variational inference

+

CmdStanR also supports running Stan’s optimization algorithms and its +algorithms for variational approximation of full Bayesian inference. +These are run via the $optimize(), $laplace(), +$variational(), and $pathfinder() methods, +which are called in a similar way to the $sample() method +demonstrated above.

+
+

Optimization

+

We can find the (penalized) maximum likelihood estimate (MLE) using +$optimize().

+
fit_mle <- mod$optimize(data = data_list, seed = 123)
+fit_mle$print() # includes lp__ (log prob calculated by Stan program)
+fit_mle$mle("theta")
+

Here’s a plot comparing the penalized MLE to the posterior +distribution of theta.

+
mcmc_hist(fit$draws("theta")) +
+  vline_at(fit_mle$mle("theta"), size = 1.5)
+

For optimization, by default the mode is calculated without the +Jacobian adjustment for constrained variables, which shifts the mode due +to the change of variables. To include the Jacobian adjustment and +obtain a maximum a posteriori (MAP) estimate set +jacobian=TRUE. See the Maximum +Likelihood Estimation section of the CmdStan User’s Guide for more +details.

+
fit_map <- mod$optimize(
+  data = data_list,
+  jacobian = TRUE,
+  seed = 123
+)
+
+
+

Laplace Approximation

+

The $laplace() +method produces a sample from a normal approximation centered at the +mode of a distribution in the unconstrained space. If the mode is a MAP +estimate, the samples provide an estimate of the mean and standard +deviation of the posterior distribution. If the mode is the MLE, the +sample provides an estimate of the standard error of the likelihood. +Whether the mode is the MAP or MLE depends on the value of the +jacobian argument when running optimization. See the Laplace +Sampling chapter of the CmdStan User’s Guide for more details.

+

Here we pass in the fit_map object from above as the +mode argument. If mode is omitted then +optimization will be run internally before taking draws from the normal +approximation.

+
fit_laplace <- mod$laplace(
+    mode = fit_map,
+    draws = 4000,
+    data = data_list,
+    seed = 123,
+    refresh = 1000
+  )
+fit_laplace$print("theta")
+mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025)
+
+
+

Variational (ADVI)

+

We can run Stan’s experimental Automatic Differentiation Variational +Inference (ADVI) using the $variational() +method. For details on the ADVI algorithm see the CmdStan +User’s Guide.

+
fit_vb <- mod$variational(
+  data = data_list,
+  seed = 123,
+  draws = 4000
+)
+fit_vb$print("theta")
+mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025)
+
+
+

Variational (Pathfinder)

+

Stan version 2.33 introduced a new variational method called +Pathfinder, which is intended to be faster and more stable than ADVI. +For details on how Pathfinder works see the section in the CmdStan +User’s Guide. Pathfinder is run using the $pathfinder() +method.

+
fit_pf <- mod$pathfinder(
+  data = data_list,
+  seed = 123,
+  draws = 4000
+)
+fit_pf$print("theta")
+

Let’s extract the draws, make the same plot we made after running the +other algorithms, and compare them all. approximation, and compare them +all. In this simple example the distributions are quite similar, but +this will not always be the case for more challenging problems.

+
mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) +
+  ggplot2::labs(subtitle = "Approximate posterior from pathfinder") +
+  ggplot2::xlim(0, 1)
+
mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) +
+  ggplot2::labs(subtitle = "Approximate posterior from variational") +
+  ggplot2::xlim(0, 1)
+
mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) +
+  ggplot2::labs(subtitle = "Approximate posterior from Laplace") +
+  ggplot2::xlim(0, 1)
+
mcmc_hist(fit$draws("theta"), binwidth = 0.025) +
+  ggplot2::labs(subtitle = "Posterior from MCMC") +
+  ggplot2::xlim(0, 1)
+

For more details on the $optimize(), +$laplace(), $variational(), and +pathfinder() methods, follow these links to their +documentation pages.

+ +
+
+
+

Saving fitted model objects

+

The $save_object() +method provided by CmdStanR is the most convenient way to save a fitted +model object to disk and ensure that all of the contents are available +when reading the object back into R.

+
fit$save_object(file = "fit.RDS")
+
+# can be read back in using readRDS
+fit2 <- readRDS("fit.RDS")
+

But if your model object is large, then $save_object() +could take a long time. $save_object() +reads the CmdStan results files into memory, stores them in the model +object, and saves the object with saveRDS(). To speed up +the process, you can emulate $save_object() +and replace saveRDS with the much faster +qsave() function from the qs package.

+
# Load CmdStan output files into the fitted model object.
+fit$draws() # Load posterior draws into the object.
+try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics.
+try(fit$init(), silent = TRUE) # Load user-defined initial values.
+try(fit$profiles(), silent = TRUE) # Load profiling samples.
+
+# Save the object to a file.
+qs::qsave(x = fit, file = "fit.qs")
+
+# Read the object.
+fit2 <- qs::qread("fit.qs")
+

Storage is even faster if you discard results you do not need to +save. The following example saves only posterior draws and discards +sampler diagnostics, user-specified initial values, and profiling +data.

+
# Load posterior draws into the fitted model object and omit other output.
+fit$draws()
+
+# Save the object to a file.
+qs::qsave(x = fit, file = "fit.qs")
+
+# Read the object.
+fit2 <- qs::qread("fit.qs")
+

See the vignette How +does CmdStanR work? for more information about the composition +of CmdStanR objects.

+
+
+

Comparison with RStan

+
+
+

Additional resources

+

There are additional vignettes available that discuss other aspects +of using CmdStanR. These can be found online at the CmdStanR +website:

+ +

To ask a question please post on the Stan forums:

+ +

To report a bug, suggest a feature (including additions to these +vignettes), or to start contributing to CmdStanR development (new +contributors welcome!) please open an issue on GitHub:

+ +
+ + + + + + + + + + + diff --git a/doc/posterior.R b/doc/posterior.R new file mode 100644 index 000000000..a990a31c3 --- /dev/null +++ b/doc/posterior.R @@ -0,0 +1,77 @@ +params <- +list(EVAL = FALSE) + +## ----settings-knitr, include=FALSE-------------------------------------------- +stopifnot(require(knitr)) +opts_chunk$set( + # collapse = TRUE, + dev = "png", + dpi = 150, + fig.asp = 0.618, + fig.width = 5, + out.width = "60%", + fig.align = "center", + comment = NA, + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +## ----include=FALSE------------------------------------------------------------ +# # Needed temporarily to avoiding weird rendering of posterior's tibbles +# # in pkgdown sites +# print.tbl_df <- function(x, ...) { +# print.data.frame(x) +# } + +## ----------------------------------------------------------------------------- +# fit <- cmdstanr::cmdstanr_example("schools", method = "sample") +# fit$summary() + +## ----------------------------------------------------------------------------- +# posterior::default_summary_measures() + +## ----------------------------------------------------------------------------- +# fit$summary(variables = c("mu", "tau")) + +## ----------------------------------------------------------------------------- +# fit$summary(variables = c("mu", "tau"), mean, sd) + +## ----------------------------------------------------------------------------- +# fit$metadata()$model_params +# fit$summary(variables = NULL, "mean", "median") + +## ----------------------------------------------------------------------------- +# my_sd <- function(x) c(My_SD = sd(x)) +# fit$summary( +# c("mu", "tau"), +# MEAN = mean, +# "median", +# my_sd, +# ~quantile(.x, probs = c(0.1, 0.9)), +# Minimum = function(x) min(x) +# ) + +## ----------------------------------------------------------------------------- +# fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975))) + +## ----------------------------------------------------------------------------- +# fit$summary(variables = NULL, dim, colMeans) + +## ----------------------------------------------------------------------------- +# fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x))) + +## ----------------------------------------------------------------------------- +# strict_pos <- function(x) if (all(x > 0)) "yes" else "no" +# fit$summary(variables = NULL, "Strictly Positive" = strict_pos) +# # fit$print(variables = NULL, "Strictly Positive" = strict_pos) + +## ----draws, message=FALSE----------------------------------------------------- +# # default is a 3-D draws_array object from the posterior package +# # iterations x chains x variables +# draws_arr <- fit$draws() # or format="array" +# str(draws_arr) +# +# # draws x variables data frame +# draws_df <- fit$draws(format = "df") +# str(draws_df) +# print(draws_df) + diff --git a/doc/posterior.Rmd b/doc/posterior.Rmd new file mode 100644 index 000000000..15da8e681 --- /dev/null +++ b/doc/posterior.Rmd @@ -0,0 +1,126 @@ +--- +title: "Working with Posteriors" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +params: + EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") +vignette: > + %\VignetteIndexEntry{Working with Posteriors} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r child="children/_settings-knitr.Rmd"} +``` + +```{r include=FALSE} +# Needed temporarily to avoiding weird rendering of posterior's tibbles +# in pkgdown sites +print.tbl_df <- function(x, ...) { + print.data.frame(x) +} +``` + +## Summary statistics + +We can easily customize the summary statistics reported by `$summary()` and `$print()`. + +```{r} +fit <- cmdstanr::cmdstanr_example("schools", method = "sample") +fit$summary() +``` + +By default all variables are summaries with the follow functions: +```{r} +posterior::default_summary_measures() +``` + +To change the variables summarized, we use the variables argument +```{r} +fit$summary(variables = c("mu", "tau")) +``` + +We can additionally change which functions are used +```{r} +fit$summary(variables = c("mu", "tau"), mean, sd) +``` + +To summarize all variables with non-default functions, it is necessary to set explicitly set the variables argument, either to `NULL` or the full vector of variable names. +```{r} +fit$metadata()$model_params +fit$summary(variables = NULL, "mean", "median") +``` + +Summary functions can be specified by character string, function, or using a +formula (or anything else supported by `rlang::as_function()`). If these +arguments are named, those names will be used in the tibble output. If the +summary results are named they will take precedence. +```{r} +my_sd <- function(x) c(My_SD = sd(x)) +fit$summary( + c("mu", "tau"), + MEAN = mean, + "median", + my_sd, + ~quantile(.x, probs = c(0.1, 0.9)), + Minimum = function(x) min(x) +) +``` + +Arguments to all summary functions can also be specified with `.args`. +```{r} +fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975))) +``` + +The summary functions are applied to the array of sample values, with dimension `iter_sampling`x`chains`. +```{r} +fit$summary(variables = NULL, dim, colMeans) +``` + + +For this reason users may have unexpected results if they use `stats::var()` directly, as it will return a covariance matrix. An alternative is the `distributional::variance()` function, +which can also be accessed via `posterior::variance()`. +```{r} +fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x))) +``` + +Summary functions need not be numeric, but these won't work with `$print()`. + +```{r} +strict_pos <- function(x) if (all(x > 0)) "yes" else "no" +fit$summary(variables = NULL, "Strictly Positive" = strict_pos) +# fit$print(variables = NULL, "Strictly Positive" = strict_pos) +``` + +For more information, see `posterior::summarise_draws()`, which is called by `$summary()`. + + +## Extracting posterior draws/samples + +The [`$draws()`](https://mc-stan.org/cmdstanr/reference/fit-method-draws.html) +method can be used to extract the posterior draws in formats provided by the +[**posterior**](https://mc-stan.org/posterior/) package. Here we demonstrate +only the `draws_array` and `draws_df` formats, but the **posterior** package +supports other useful formats as well. + +```{r draws, message=FALSE} +# default is a 3-D draws_array object from the posterior package +# iterations x chains x variables +draws_arr <- fit$draws() # or format="array" +str(draws_arr) + +# draws x variables data frame +draws_df <- fit$draws(format = "df") +str(draws_df) +print(draws_df) +``` + +To convert an existing draws object to a different format use the +`posterior::as_draws_*()` functions. + +To manipulate the `draws` objects use the various methods described in the +posterior package [vignettes](https://mc-stan.org/posterior/articles/index.html) +and [documentation](https://mc-stan.org/posterior/reference/index.html). + diff --git a/doc/posterior.html b/doc/posterior.html new file mode 100644 index 000000000..5575a9bc3 --- /dev/null +++ b/doc/posterior.html @@ -0,0 +1,442 @@ + + + + + + + + + + + + + + +Working with Posteriors + + + + + + + + + + + + + + + + + + + + + + + + + + +

Working with Posteriors

+ + + + +
+

Summary statistics

+

We can easily customize the summary statistics reported by +$summary() and $print().

+
fit <- cmdstanr::cmdstanr_example("schools", method = "sample")
+fit$summary()
+

By default all variables are summaries with the follow functions:

+
posterior::default_summary_measures()
+

To change the variables summarized, we use the variables argument

+
fit$summary(variables = c("mu", "tau"))
+

We can additionally change which functions are used

+
fit$summary(variables = c("mu", "tau"), mean, sd)
+

To summarize all variables with non-default functions, it is +necessary to set explicitly set the variables argument, either to +NULL or the full vector of variable names.

+
fit$metadata()$model_params
+fit$summary(variables = NULL, "mean", "median")
+

Summary functions can be specified by character string, function, or +using a formula (or anything else supported by +rlang::as_function()). If these arguments are named, those +names will be used in the tibble output. If the summary results are +named they will take precedence.

+
my_sd <- function(x) c(My_SD = sd(x))
+fit$summary(
+  c("mu", "tau"), 
+  MEAN = mean, 
+  "median",
+  my_sd,
+  ~quantile(.x, probs = c(0.1, 0.9)),
+  Minimum = function(x) min(x)
+)        
+

Arguments to all summary functions can also be specified with +.args.

+
fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975)))
+

The summary functions are applied to the array of sample values, with +dimension iter_samplingxchains.

+
fit$summary(variables = NULL, dim, colMeans)
+

For this reason users may have unexpected results if they use +stats::var() directly, as it will return a covariance +matrix. An alternative is the distributional::variance() +function, which can also be accessed via +posterior::variance().

+
fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x)))
+

Summary functions need not be numeric, but these won’t work with +$print().

+
strict_pos <- function(x) if (all(x > 0)) "yes" else "no"
+fit$summary(variables = NULL, "Strictly Positive" = strict_pos)
+# fit$print(variables = NULL, "Strictly Positive" = strict_pos)
+

For more information, see posterior::summarise_draws(), +which is called by $summary().

+
+
+

Extracting posterior draws/samples

+

The $draws() +method can be used to extract the posterior draws in formats provided by +the posterior +package. Here we demonstrate only the draws_array and +draws_df formats, but the posterior +package supports other useful formats as well.

+
# default is a 3-D draws_array object from the posterior package
+# iterations x chains x variables
+draws_arr <- fit$draws() # or format="array"
+str(draws_arr)
+
+# draws x variables data frame
+draws_df <- fit$draws(format = "df")
+str(draws_df)
+print(draws_df)
+

To convert an existing draws object to a different format use the +posterior::as_draws_*() functions.

+

To manipulate the draws objects use the various methods +described in the posterior package vignettes +and documentation.

+
+ + + + + + + + + + + diff --git a/doc/profiling.R b/doc/profiling.R new file mode 100644 index 000000000..5a411341c --- /dev/null +++ b/doc/profiling.R @@ -0,0 +1,106 @@ +params <- +list(EVAL = FALSE) + +## ----settings-knitr, include=FALSE-------------------------------------------- +stopifnot(require(knitr)) +opts_chunk$set( + # collapse = TRUE, + dev = "png", + dpi = 150, + fig.asp = 0.618, + fig.width = 5, + out.width = "60%", + fig.align = "center", + comment = NA, + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +## ----library, message=FALSE--------------------------------------------------- +# library(cmdstanr) +# check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) + +## ----profiling_bernoulli_logit.stan------------------------------------------- +# profiling_bernoulli_logit <- write_stan_file(' +# data { +# int k; +# int n; +# matrix[n, k] X; +# array[n] int y; +# } +# parameters { +# vector[k] beta; +# real alpha; +# } +# model { +# profile("priors") { +# target += std_normal_lpdf(beta); +# target += std_normal_lpdf(alpha); +# } +# profile("likelihood") { +# target += bernoulli_logit_lpmf(y | X * beta + alpha); +# } +# } +# ') + +## ----fit-model, message=FALSE, results='hide'--------------------------------- +# # Compile the model +# model <- cmdstan_model(profiling_bernoulli_logit) +# +# # Generate some fake data +# n <- 1000 +# k <- 20 +# X <- matrix(rnorm(n * k), ncol = k) +# +# y <- 3 * X[,1] - 2 * X[,2] + 1 +# p <- runif(n) +# y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0) +# stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X) +# +# # Run one chain of the model +# fit <- model$sample(data = stan_data, chains = 1) + +## ----profiles----------------------------------------------------------------- +# fit$profiles() + +## ----profiling_bernoulli_logit_glm.stan--------------------------------------- +# profiling_bernoulli_logit_glm <- write_stan_file(' +# data { +# int k; +# int n; +# matrix[n, k] X; +# array[n] int y; +# } +# parameters { +# vector[k] beta; +# real alpha; +# } +# model { +# profile("priors") { +# target += std_normal_lpdf(beta); +# target += std_normal_lpdf(alpha); +# } +# profile("likelihood") { +# target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); +# } +# } +# ') + +## ----fit-model-glm, message=FALSE, results='hide'----------------------------- +# model_glm <- cmdstan_model(profiling_bernoulli_logit_glm) +# fit_glm <- model_glm$sample(data = stan_data, chains = 1) + +## ----profiles-glm------------------------------------------------------------- +# fit_glm$profiles() + +## ----per-gradient------------------------------------------------------------- +# profile_chain_1 <- fit$profiles()[[1]] +# per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls +# print(per_gradient_timing) # two elements for the two profile statements in the model + +## ----profile_files------------------------------------------------------------ +# fit$profile_files() + +## ----save_profile_files, eval=FALSE------------------------------------------- +# # see ?save_profile_files for info on optional arguments +# fit$save_profile_files(dir = "path/to/directory") + diff --git a/doc/profiling.Rmd b/doc/profiling.Rmd new file mode 100644 index 000000000..1f9eb0dc5 --- /dev/null +++ b/doc/profiling.Rmd @@ -0,0 +1,233 @@ +--- +title: "Profiling Stan programs with CmdStanR" +author: "Rok Češnovar, Jonah Gabry and Ben Bales" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 4 +params: + EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") +vignette: > + %\VignetteIndexEntry{Profiling Stan programs with CmdStanR} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r child="children/_settings-knitr.Rmd"} +``` + +## Introduction + +This vignette demonstrates how to use the new profiling functionality +introduced in CmdStan 2.26.0. + +Profiling identifies which parts of a Stan program are taking the longest time +to run and is therefore a useful guide when working on optimizing the +performance of a model. + +However, be aware that the statistical assumptions that go into a model are +the most important factors in overall model performance. It is often not +possible to make up for model problems with just brute force computation. For +ideas on how to address performance of your model from a statistical +perspective, see Gelman (2020). + +```{r library, message=FALSE} +library(cmdstanr) +check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) +``` + +## Adding profiling statements to a Stan program + +Consider a simple logistic regression with parameters `alpha` and `beta`, +covariates `X`, and outcome `y`. + +``` +data { + int k; + int n; + matrix[n, k] X; + array[n] int y; +} +parameters { + vector[k] beta; + real alpha; +} +model { + beta ~ std_normal(); + alpha ~ std_normal(); + + y ~ bernoulli_logit(X * beta + alpha); +} +``` + +A simple question is how much time do the prior calculations take compared +against the likelihood? To answer this we surround the prior and likelihood +calculations with `profile` statements. + +``` +profile("priors") { + target += std_normal_lpdf(beta); + target += std_normal_lpdf(alpha); +} +profile("likelihood") { + target += bernoulli_logit_lpmf(y | X * beta + alpha); +} +``` + +In general we recommend using a separate `.stan` file, but for convenience in +this vignette we'll write the Stan program as a string and use +`write_stan_file()` to write it to a temporary file. + +```{r profiling_bernoulli_logit.stan} +profiling_bernoulli_logit <- write_stan_file(' +data { + int k; + int n; + matrix[n, k] X; + array[n] int y; +} +parameters { + vector[k] beta; + real alpha; +} +model { + profile("priors") { + target += std_normal_lpdf(beta); + target += std_normal_lpdf(alpha); + } + profile("likelihood") { + target += bernoulli_logit_lpmf(y | X * beta + alpha); + } +} +') +``` + +We can then run the model as usual and Stan will collect the profiling +information for any sections with `profile` statements. + +```{r fit-model, message=FALSE, results='hide'} +# Compile the model +model <- cmdstan_model(profiling_bernoulli_logit) + +# Generate some fake data +n <- 1000 +k <- 20 +X <- matrix(rnorm(n * k), ncol = k) + +y <- 3 * X[,1] - 2 * X[,2] + 1 +p <- runif(n) +y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0) +stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X) + +# Run one chain of the model +fit <- model$sample(data = stan_data, chains = 1) +``` + +## Accessing the profiling information from R + +The raw profiling information can then be accessed with the `$profiles()` +method, which returns a list containing one data frame per chain (profiles +across multiple chains are not automatically aggregated). Details on the column +names are available in the +[CmdStan documentation](https://mc-stan.org/docs/2_26/cmdstan-guide/stan-csv.html#profiling-csv-output-file). + +```{r profiles} +fit$profiles() +``` + +The `total_time` column is the total time spent inside a given profile +statement. It is clear that the vast majority of time is spent in the likelihood +function. + +## Comparing to a faster version of the model + +Stan's specialized glm functions can be used to make models like this faster. In +this case the likelihood can be replaced with + +``` +target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); +``` + +We'll keep the same `profile()` statements so that the profiling information for +the new model is collected automatically just like for the previous one. + +```{r profiling_bernoulli_logit_glm.stan} +profiling_bernoulli_logit_glm <- write_stan_file(' +data { + int k; + int n; + matrix[n, k] X; + array[n] int y; +} +parameters { + vector[k] beta; + real alpha; +} +model { + profile("priors") { + target += std_normal_lpdf(beta); + target += std_normal_lpdf(alpha); + } + profile("likelihood") { + target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); + } +} +') +``` + +```{r fit-model-glm, message=FALSE, results='hide'} +model_glm <- cmdstan_model(profiling_bernoulli_logit_glm) +fit_glm <- model_glm$sample(data = stan_data, chains = 1) +``` + +```{r profiles-glm} +fit_glm$profiles() +``` + +We can see from the `total_time` column that this is much faster than the +previous model. + +## Per-gradient timings, and memory usage + +The other columns of the profiling output are documented in the +[CmdStan documentation](https://mc-stan.org/docs/2_26/cmdstan-guide/stan-csv.html#profiling-csv-output-file). + +The timing numbers are broken down by forward pass and reverse pass, and the +`chain_stack` and `no_chain_stack` columns contain information about how many +autodiff variables were saved in the process of performing a calculation. + +These numbers are all totals -- times are the total times over the whole +calculation, and `chain_stack` counts are similarly the total counts of autodiff +variables used over the whole calculation. It is often convenient to have +per-gradient calculations (which will be more stable across runs with different +seeds). To compute these, use the `autodiff_calls` column. + +```{r per-gradient} +profile_chain_1 <- fit$profiles()[[1]] +per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls +print(per_gradient_timing) # two elements for the two profile statements in the model +``` + +## Accessing and saving the profile files + +After sampling (or optimization or variational inference) finishes, CmdStan stores +the profiling data in CSV files in a temporary location. +The paths of the profiling CSV files can be retrieved using `$profile_files()`. + +```{r profile_files} +fit$profile_files() +``` + +These can be saved to a more permanent location with the `$save_profile_files()` +method. + +```{r save_profile_files, eval=FALSE} +# see ?save_profile_files for info on optional arguments +fit$save_profile_files(dir = "path/to/directory") +``` + +# References + +Gelman, Andrew, Aki Vehtari, Daniel Simpson, Charles C. Margossian, Bob +Carpenter, Yuling Yao, Lauren Kennedy, Jonah Gabry, Paul-Christian Bürkner, and +Martin Modrák. 2020. "Bayesian Workflow." https://arxiv.org/abs/2011.01808. diff --git a/doc/profiling.html b/doc/profiling.html new file mode 100644 index 000000000..6a95413be --- /dev/null +++ b/doc/profiling.html @@ -0,0 +1,552 @@ + + + + + + + + + + + + + + + +Profiling Stan programs with CmdStanR + + + + + + + + + + + + + + + + + + + + + + + + + + +

Profiling Stan programs with CmdStanR

+

Rok Češnovar, Jonah Gabry and Ben Bales

+ + + + +
+

Introduction

+

This vignette demonstrates how to use the new profiling functionality +introduced in CmdStan 2.26.0.

+

Profiling identifies which parts of a Stan program are taking the +longest time to run and is therefore a useful guide when working on +optimizing the performance of a model.

+

However, be aware that the statistical assumptions that go into a +model are the most important factors in overall model performance. It is +often not possible to make up for model problems with just brute force +computation. For ideas on how to address performance of your model from +a statistical perspective, see Gelman (2020).

+
library(cmdstanr)
+check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)
+
+
+

Adding profiling statements to a Stan program

+

Consider a simple logistic regression with parameters +alpha and beta, covariates X, and +outcome y.

+
data {
+  int<lower=1> k;
+  int<lower=0> n;
+  matrix[n, k] X;
+  array[n] int y;
+}
+parameters {
+  vector[k] beta;
+  real alpha;
+}
+model {
+  beta ~ std_normal();
+  alpha ~ std_normal();
+
+  y ~ bernoulli_logit(X * beta + alpha);
+}
+

A simple question is how much time do the prior calculations take +compared against the likelihood? To answer this we surround the prior +and likelihood calculations with profile statements.

+
profile("priors") {
+  target += std_normal_lpdf(beta);
+  target += std_normal_lpdf(alpha);
+}
+profile("likelihood") {
+  target += bernoulli_logit_lpmf(y | X * beta + alpha);
+}
+

In general we recommend using a separate .stan file, but +for convenience in this vignette we’ll write the Stan program as a +string and use write_stan_file() to write it to a temporary +file.

+
profiling_bernoulli_logit <- write_stan_file('
+data {
+  int<lower=1> k;
+  int<lower=0> n;
+  matrix[n, k] X;
+  array[n] int y;
+}
+parameters {
+  vector[k] beta;
+  real alpha;
+}
+model {
+  profile("priors") {
+    target += std_normal_lpdf(beta);
+    target += std_normal_lpdf(alpha);
+  }
+  profile("likelihood") {
+    target += bernoulli_logit_lpmf(y | X * beta + alpha);
+  }
+}
+')
+

We can then run the model as usual and Stan will collect the +profiling information for any sections with profile +statements.

+
# Compile the model
+model <- cmdstan_model(profiling_bernoulli_logit)
+
+# Generate some fake data
+n <- 1000
+k <- 20
+X <- matrix(rnorm(n * k), ncol = k)
+
+y <- 3 * X[,1] - 2 * X[,2] + 1
+p <- runif(n)
+y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0)
+stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X)
+
+# Run one chain of the model
+fit <- model$sample(data = stan_data, chains = 1)
+
+
+

Accessing the profiling information from R

+

The raw profiling information can then be accessed with the +$profiles() method, which returns a list containing one +data frame per chain (profiles across multiple chains are not +automatically aggregated). Details on the column names are available in +the CmdStan +documentation.

+
fit$profiles()
+

The total_time column is the total time spent inside a +given profile statement. It is clear that the vast majority of time is +spent in the likelihood function.

+
+
+

Comparing to a faster version of the model

+

Stan’s specialized glm functions can be used to make models like this +faster. In this case the likelihood can be replaced with

+
target += bernoulli_logit_glm_lpmf(y | X, alpha, beta);
+

We’ll keep the same profile() statements so that the +profiling information for the new model is collected automatically just +like for the previous one.

+
profiling_bernoulli_logit_glm <- write_stan_file('
+data {
+  int<lower=1> k;
+  int<lower=0> n;
+  matrix[n, k] X;
+  array[n] int y;
+}
+parameters {
+  vector[k] beta;
+  real alpha;
+}
+model {
+  profile("priors") {
+    target += std_normal_lpdf(beta);
+    target += std_normal_lpdf(alpha);
+  }
+  profile("likelihood") {
+    target += bernoulli_logit_glm_lpmf(y | X, alpha, beta);
+  }
+}
+')
+
model_glm <- cmdstan_model(profiling_bernoulli_logit_glm)
+fit_glm <- model_glm$sample(data = stan_data, chains = 1)
+
fit_glm$profiles()
+

We can see from the total_time column that this is much +faster than the previous model.

+
+
+

Per-gradient timings, and memory usage

+

The other columns of the profiling output are documented in the CmdStan +documentation.

+

The timing numbers are broken down by forward pass and reverse pass, +and the chain_stack and no_chain_stack columns +contain information about how many autodiff variables were saved in the +process of performing a calculation.

+

These numbers are all totals – times are the total times over the +whole calculation, and chain_stack counts are similarly the +total counts of autodiff variables used over the whole calculation. It +is often convenient to have per-gradient calculations (which will be +more stable across runs with different seeds). To compute these, use the +autodiff_calls column.

+
profile_chain_1 <- fit$profiles()[[1]]
+per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls
+print(per_gradient_timing) # two elements for the two profile statements in the model
+
+
+

Accessing and saving the profile files

+

After sampling (or optimization or variational inference) finishes, +CmdStan stores the profiling data in CSV files in a temporary location. +The paths of the profiling CSV files can be retrieved using +$profile_files().

+
fit$profile_files()
+

These can be saved to a more permanent location with the +$save_profile_files() method.

+
# see ?save_profile_files for info on optional arguments
+fit$save_profile_files(dir = "path/to/directory")
+
+
+

References

+

Gelman, Andrew, Aki Vehtari, Daniel Simpson, Charles C. Margossian, +Bob Carpenter, Yuling Yao, Lauren Kennedy, Jonah Gabry, Paul-Christian +Bürkner, and Martin Modrák. 2020. “Bayesian Workflow.” https://arxiv.org/abs/2011.01808.

+
+ + + + + + + + + + + diff --git a/doc/r-markdown.R b/doc/r-markdown.R new file mode 100644 index 000000000..77fa734c7 --- /dev/null +++ b/doc/r-markdown.R @@ -0,0 +1,28 @@ +params <- +list(EVAL = FALSE) + +## ----settings-knitr, include = FALSE------------------------------------------ +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +## ----register-engine, message=FALSE------------------------------------------- +# library(cmdstanr) +# register_knitr_engine(override = TRUE) + +## ----print-ex1---------------------------------------------------------------- +# ex1$print() + +## ----fit-ex1------------------------------------------------------------------ +# fit <- ex1$sample( +# refresh = 0, +# seed = 42L +# ) +# +# print(fit) + +## ----register-engine-no-override---------------------------------------------- +# register_knitr_engine(override = FALSE) + diff --git a/doc/r-markdown.Rmd b/doc/r-markdown.Rmd new file mode 100644 index 000000000..ec03831e0 --- /dev/null +++ b/doc/r-markdown.Rmd @@ -0,0 +1,157 @@ +--- +title: "R Markdown CmdStan Engine" +author: "Mikhail Popov" +output: + rmarkdown::html_vignette: + toc: true +params: + EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") +vignette: > + %\VignetteIndexEntry{R Markdown CmdStan Engine} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r settings-knitr, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) +``` + +R Markdown supports a variety of languages through the use of knitr language +engines. Where users wish to write Stan programs as chunks directly in R Markdown documents there are three options: + +1. the user wishes all the Stan chunks in the R Markdown document to be processed using RStan; +2. all Stan chunks are to be processed using CmdStanR; and. +3. some chunks are to be processed by RStan and some by CmdStanR. + +Behind the scenes in each option, the engine compiles the model code in each +chunk and creates an object that provides methods to run the model: a +`stanmodel` if Rstan is being used, or a `CmdStanModel` in the CmdStanR case. +This model object is assigned to a variable with the name given by the +`output.var` chunk option. + +## Option 1: Using RStan for all chunks + +This is the default option. In that case we can write, for example: + +````{verbatim} +```{stan, output.var="model"} +// Stan model code +``` + +```{r} +rstan::sampling(model) +``` +```` + +## Option 2: Using CmdStanR for all chunks + +If CmdStanR is being used a replacement engine needs to be registered along the following lines: + +```{r register-engine, message=FALSE} +library(cmdstanr) +register_knitr_engine(override = TRUE) +``` + +This overrides knitr's built-in `stan` engine so that all `stan` +chunks are processed with CmdStanR, not RStan. Of course, this also means that +the variable specified by `output.var` will no longer be a `stanmodel` object, +but instead a `CmdStanModel` object, so the example code above would look like this: + +````{verbatim} +```{stan, output.var="model"} +// Stan model code +``` + +```{r} +model$sample() +``` +```` + +## Example + +```{stan ex1, output.var="ex1"} +// This stan chunk results in a CmdStanModel object called "ex1" +parameters { + array[2] real y; +} +model { + y[1] ~ normal(0, 1); + y[2] ~ double_exponential(0, 2); +} +``` + +```{r print-ex1} +ex1$print() +``` + +```{r fit-ex1} +fit <- ex1$sample( + refresh = 0, + seed = 42L +) + +print(fit) +``` + + +## Option 3: Using both RStan and CmdStanR in the same R Markdown document + +While the default behavior is to override the built-in `stan` engine because the +assumption is that the user is probably not using both RStan and CmdStanR in the +same document or project, the option to use both exists. When registering +CmdStanR's knitr engine, set `override = FALSE` to register the engine as a +`cmdstan` engine: + +```{r register-engine-no-override} +register_knitr_engine(override = FALSE) +``` + +This will cause `stan` chunks to be processed by knitr's built-in, RStan-based +engine and only use CmdStanR's knitr engine for `cmdstan` chunks: + +````{verbatim} +```{stan, output.var="model_obj1"} +// Results in a stanmodel object from RStan +``` + +```{r} +rstan::sampling(model_obj1) +``` + +```{cmdstan, output.var="model_obj2"} +// Results in a CmdStanModel object from CmdStanR +``` + +```{r} +model_obj2$sample() +``` +```` + + +## Caching chunks + +Use `cache=TRUE` chunk option to avoid re-compiling the Stan model code every +time the R Markdown is knit/rendered. + +You can find the Stan model file and the compiled executable in the document's +cache directory. + + + +## Running interactively + +When running chunks interactively in RStudio (e.g. when using +[R Notebooks](https://bookdown.org/yihui/rmarkdown/notebook.html)), it has been +observed that the built-in, RStan-based engine is used for `stan` chunks even +when CmdStanR's engine has been registered in the session as the engine for +`stan`. As a workaround, when running chunks *interactively*, it is recommended +to use the `override = FALSE` option and change `stan` chunks to be `cmdstan` +chunks. + +Do not worry: if the template you use supports syntax highlighting for the Stan +language, that syntax highlighting will be applied to `cmdstan` chunks when the +document is knit/rendered. diff --git a/doc/r-markdown.html b/doc/r-markdown.html new file mode 100644 index 000000000..b0444ef3e --- /dev/null +++ b/doc/r-markdown.html @@ -0,0 +1,493 @@ + + + + + + + + + + + + + + + +R Markdown CmdStan Engine + + + + + + + + + + + + + + + + + + + + + + + + + + +

R Markdown CmdStan Engine

+

Mikhail Popov

+ + + + +

R Markdown supports a variety of languages through the use of knitr +language engines. Where users wish to write Stan programs as chunks +directly in R Markdown documents there are three options:

+
    +
  1. the user wishes all the Stan chunks in the R Markdown document to be +processed using RStan;
    +
  2. +
  3. all Stan chunks are to be processed using CmdStanR; and.
    +
  4. +
  5. some chunks are to be processed by RStan and some by CmdStanR.
  6. +
+

Behind the scenes in each option, the engine compiles the model code +in each chunk and creates an object that provides methods to run the +model: a stanmodel if Rstan is being used, or a +CmdStanModel in the CmdStanR case. This model object is +assigned to a variable with the name given by the +output.var chunk option.

+
+

Option 1: Using RStan for all chunks

+

This is the default option. In that case we can write, for +example:

+
```{stan, output.var="model"}
+// Stan model code
+```
+
+```{r}
+rstan::sampling(model)
+```
+
+
+

Option 2: Using CmdStanR for all chunks

+

If CmdStanR is being used a replacement engine needs to be registered +along the following lines:

+
library(cmdstanr)
+register_knitr_engine(override = TRUE)
+

This overrides knitr’s built-in stan engine so that all +stan chunks are processed with CmdStanR, not RStan. Of +course, this also means that the variable specified by +output.var will no longer be a stanmodel +object, but instead a CmdStanModel object, so the example +code above would look like this:

+
```{stan, output.var="model"}
+// Stan model code
+```
+
+```{r}
+model$sample()
+```
+
+
+

Example

+
// This stan chunk results in a CmdStanModel object called "ex1"
+parameters {
+  array[2] real y;
+}
+model {
+  y[1] ~ normal(0, 1);
+  y[2] ~ double_exponential(0, 2);
+}
+
ex1$print()
+
fit <- ex1$sample(
+  refresh = 0,
+  seed = 42L
+)
+
+print(fit)
+
+
+

Option 3: Using both RStan and CmdStanR in the same R Markdown +document

+

While the default behavior is to override the built-in +stan engine because the assumption is that the user is +probably not using both RStan and CmdStanR in the same document or +project, the option to use both exists. When registering CmdStanR’s +knitr engine, set override = FALSE to register the engine +as a cmdstan engine:

+
register_knitr_engine(override = FALSE)
+

This will cause stan chunks to be processed by knitr’s +built-in, RStan-based engine and only use CmdStanR’s knitr engine for +cmdstan chunks:

+
```{stan, output.var="model_obj1"}
+// Results in a stanmodel object from RStan
+```
+
+```{r}
+rstan::sampling(model_obj1)
+```
+
+```{cmdstan, output.var="model_obj2"}
+// Results in a CmdStanModel object from CmdStanR
+```
+
+```{r}
+model_obj2$sample()
+```
+
+
+

Caching chunks

+

Use cache=TRUE chunk option to avoid re-compiling the +Stan model code every time the R Markdown is knit/rendered.

+

You can find the Stan model file and the compiled executable in the +document’s cache directory.

+
+
+

Running interactively

+

When running chunks interactively in RStudio (e.g. when using R +Notebooks), it has been observed that the built-in, RStan-based +engine is used for stan chunks even when CmdStanR’s engine +has been registered in the session as the engine for stan. +As a workaround, when running chunks interactively, it is +recommended to use the override = FALSE option and change +stan chunks to be cmdstan chunks.

+

Do not worry: if the template you use supports syntax highlighting +for the Stan language, that syntax highlighting will be applied to +cmdstan chunks when the document is knit/rendered.

+
+ + + + + + + + + + +