diff --git a/doc/cmdstanr-internals.R b/doc/cmdstanr-internals.R deleted file mode 100644 index 297b14e2..00000000 --- a/doc/cmdstanr-internals.R +++ /dev/null @@ -1,214 +0,0 @@ -params <- -list(EVAL = FALSE) - -## ----settings-knitr, include=FALSE-------------------------------------------- -stopifnot(require(knitr)) -opts_chunk$set( - # collapse = TRUE, - dev = "png", - dpi = 150, - fig.asp = 0.618, - fig.width = 5, - out.width = "60%", - fig.align = "center", - comment = NA, - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) - -## ----setup, message=FALSE----------------------------------------------------- -# library(cmdstanr) -# check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) - -## ----start-clean, include=FALSE----------------------------------------------- -# exe <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli") -# unlink(exe) - -## ----compile------------------------------------------------------------------ -# stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") -# mod <- cmdstan_model(stan_file) -# mod$print() -# mod$stan_file() -# mod$exe_file() - -## ----already-compiled--------------------------------------------------------- -# mod <- cmdstan_model(stan_file) - -## ----compile-options, eval=FALSE---------------------------------------------- -# mod <- cmdstan_model( -# stan_file, -# force_recompile = TRUE, -# include_paths = "paths/to/directories/with/included/files", -# cpp_options = list(stan_threads = TRUE, STANC2 = TRUE) -# ) - -## ----compile-method----------------------------------------------------------- -# unlink(mod$exe_file()) -# mod <- cmdstan_model(stan_file, compile = FALSE) -# mod$exe_file() # not yet created -# mod$compile() -# mod$exe_file() - -## ----stan_file_pedantic------------------------------------------------------- -# stan_file_pedantic <- write_stan_file(" -# data { -# int N; -# array[N] int y; -# } -# parameters { -# // should have but omitting to demonstrate pedantic mode -# real lambda; -# } -# model { -# y ~ poisson(lambda); -# } -# ") - -## ----pedantic-compile, collapse = TRUE---------------------------------------- -# mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE) - -## ----pedantic-check_syntax, collapse=TRUE------------------------------------- -# mod_pedantic$check_syntax(pedantic = TRUE) - -## ----pedantic-check_syntax-2, collapse=TRUE----------------------------------- -# file.remove(mod_pedantic$exe_file()) # delete compiled executable -# rm(mod_pedantic) -# -# mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE) -# mod_pedantic$check_syntax(pedantic = TRUE) - -## ----stan_file_variables------------------------------------------------------ -# stan_file_variables <- write_stan_file(" -# data { -# int J; -# vector[J] sigma; -# vector[J] y; -# } -# parameters { -# real mu; -# real tau; -# vector[J] theta_raw; -# } -# transformed parameters { -# vector[J] theta = mu + tau * theta_raw; -# } -# model { -# target += normal_lpdf(tau | 0, 10); -# target += normal_lpdf(mu | 0, 10); -# target += normal_lpdf(theta_raw | 0, 1); -# target += normal_lpdf(y | theta, sigma); -# } -# ") -# mod_v <- cmdstan_model(stan_file_variables) -# variables <- mod_v$variables() - -## ----variables-list-names----------------------------------------------------- -# names(variables) -# names(variables$data) -# names(variables$parameters) -# names(variables$transformed_parameters) -# names(variables$generated_quantities) - -## ----variable-type-dims------------------------------------------------------- -# variables$data$J -# variables$data$sigma -# variables$parameters$tau -# variables$transformed_parameters$theta - -## ----compile-with-dir, eval = FALSE------------------------------------------- -# mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable") - -## ----print-program-again------------------------------------------------------ -# mod$print() - -## ----data-list, eval=FALSE---------------------------------------------------- -# # data block has 'N' and 'y' -# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -# fit <- mod$sample(data = data_list) - -## ----write_stan_json---------------------------------------------------------- -# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -# json_file <- tempfile(fileext = ".json") -# write_stan_json(data_list, json_file) -# cat(readLines(json_file), sep = "\n") - -## ----data-json, eval=FALSE---------------------------------------------------- -# fit <- mod$sample(data = json_file) - -## ----data-rdump, eval=FALSE--------------------------------------------------- -# rdump_file <- tempfile(fileext = ".data.R") -# rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list)) -# cat(readLines(rdump_file), sep = "\n") -# fit <- mod$sample(data = rdump_file) - -## ----sample-tempdir, results = "hide"----------------------------------------- -# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -# fit <- mod$sample(data = data_list) - -## ----output_files------------------------------------------------------------- -# fit$output_files() - -## ----gc----------------------------------------------------------------------- -# files <- fit$output_files() -# file.exists(files) -# -# rm(fit) -# gc() -# -# file.exists(files) - -## ----save_output_files, eval=FALSE-------------------------------------------- -# # see ?save_output_files for info on optional arguments -# fit$save_output_files(dir = "path/to/directory") - -## ----output_dir, eval = FALSE------------------------------------------------- -# fit <- mod$sample( -# data = data_list, -# output_dir = "path/to/directory" -# ) - -## ----refit, include=FALSE----------------------------------------------------- -# fit <- mod$sample(data = data_list) - -## ----csv-not-read------------------------------------------------------------- -# str(fit) - -## ----for-csv-reading---------------------------------------------------------- -# draws <- fit$draws() # force CSVs to be read into R -# str(fit) - -## ----read_cmdstan_csv--------------------------------------------------------- -# # see ?read_cmdstan_csv for info on optional arguments controlling -# # what information is read in -# csv_contents <- read_cmdstan_csv(fit$output_files()) -# str(csv_contents) - -## ----as_cmdstan_fit----------------------------------------------------------- -# fit2 <- as_cmdstan_fit(fit$output_files()) - -## ----save_latent_dynamics, results = "hide"----------------------------------- -# fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE) - -## ----read-latent-dynamics----------------------------------------------------- -# fit$latent_dynamics_files() -# -# # read one of the files in -# x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#") -# head(x) - -## ----explore-latent-dynamics-------------------------------------------------- -# head(x[, c("theta", "p_theta", "g_theta")]) - -## ----verbose-mode------------------------------------------------------------- -# options("cmdstanr_verbose"=TRUE) -# -# mod <- cmdstan_model(stan_file, force_recompile = TRUE) -# fit <- mod$sample( -# data = data_list, -# chains = 1, -# iter_warmup = 100, -# iter_sampling = 100 -# ) - -## ----include=FALSE------------------------------------------------------------ -# options("cmdstanr_verbose" = FALSE) - diff --git a/doc/cmdstanr-internals.Rmd b/doc/cmdstanr-internals.Rmd deleted file mode 100644 index 96ef06ea..00000000 --- a/doc/cmdstanr-internals.Rmd +++ /dev/null @@ -1,487 +0,0 @@ ---- -title: "How does CmdStanR work?" -author: "Jonah Gabry and Rok Češnovar" -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 4 -params: - EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") -vignette: > - %\VignetteIndexEntry{How does CmdStanR work?} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r child="children/_settings-knitr.Rmd"} -``` - -## Introduction - -This vignette is intended to be read after the [_Getting started with CmdStanR_](http://mc-stan.org/cmdstanr/articles/cmdstanr.html) -vignette. Please read that first for important background. In this document we -provide additional details about compiling models, passing in data, and how -CmdStan output is saved and read back into R. - -We will only use the `$sample()` method in examples, but all model fitting -methods work in a similar way under the hood. - -```{r setup, message=FALSE} -library(cmdstanr) -check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) -``` - -## Compilation - -### Immediate compilation - -The `cmdstan_model()` function creates a new `CmdStanModel` object. The -`CmdStanModel` object stores the path to a Stan program as well as the -path to a compiled executable. - -```{r start-clean, include=FALSE} -exe <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli") -unlink(exe) -``` - -```{r compile} -stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") -mod <- cmdstan_model(stan_file) -mod$print() -mod$stan_file() -mod$exe_file() -``` - -Subsequently, if you create a `CmdStanModel` object from the same Stan file -then compilation will be skipped (assuming the file hasn't changed). - -```{r already-compiled} -mod <- cmdstan_model(stan_file) -``` - -Internally, `cmdstan_model()` first creates the `CmdStanModel` object from -just the Stan file and then calls its [`$compile()`](http://mc-stan.org/cmdstanr/reference/model-method-compile.html) -method. Optional arguments to the `$compile()` method can be passed via `...`. - -```{r compile-options, eval=FALSE} -mod <- cmdstan_model( - stan_file, - force_recompile = TRUE, - include_paths = "paths/to/directories/with/included/files", - cpp_options = list(stan_threads = TRUE, STANC2 = TRUE) -) -``` - - - -### Delayed compilation - -It is also possible to delay compilation when creating the `CmdStanModel` object -by specifying `compile=FALSE` and then later calling the `$compile()` method -directly. - -```{r compile-method} -unlink(mod$exe_file()) -mod <- cmdstan_model(stan_file, compile = FALSE) -mod$exe_file() # not yet created -mod$compile() -mod$exe_file() -``` - -### Pedantic check - -If you are using CmdStan version 2.24 or later and CmdStanR version 0.2.1 or -later, you can run a pedantic check for your model. CmdStanR will always check -that your Stan program does not contain any invalid syntax but with pedantic -mode enabled the check will also warn you about other potential issues in your -model, for example: - -- Distribution usages issues: distribution arguments do not match the -distribution specification, or some specific distribution is used in an -inadvisable way. -- Unused parameter: a parameter is defined but does not contribute to target. -- Large or small constant in a distribution: very large or very small constants -are used as distribution arguments. -- Control flow depends on a parameter: branching control flow (like if/else) -depends on a parameter value. -- Parameter has multiple twiddles: a parameter is on the left-hand side of -multiple twiddles (i.e., multiple `~` symbols). -- Parameter has zero or multiple priors: a parameter has zero or more than one -prior distribution. -- Variable is used before assignment: a variable is used before being assigned a -value. -- Strict or nonsensical parameter bounds: a parameter is given questionable -bounds. - -For the latest information on the checks performed in pedantic mode see the -[Pedantic mode chapter](https://mc-stan.org/docs/stan-users-guide/pedantic-mode.html) -in the Stan Reference Manual. - -Pedantic mode is available when compiling the model or when using the separate -`$check_syntax()` method of a `CmdStanModel` object. Internally this corresponds -to setting the `stanc` (Stan transpiler) option `warn-pedantic`. Here we -demonstrate pedantic mode with a Stan program that is syntactically correct but -is missing a lower bound and a prior for a parameter. - -```{r stan_file_pedantic} -stan_file_pedantic <- write_stan_file(" -data { - int N; - array[N] int y; -} -parameters { - // should have but omitting to demonstrate pedantic mode - real lambda; -} -model { - y ~ poisson(lambda); -} -") -``` - -To turn on pedantic mode at compile time you can set `pedantic=TRUE` in -the call to `cmdstan_model()` (or when calling the `$compile()` method directly -if using the delayed compilation approach described above). - -```{r pedantic-compile, collapse = TRUE} -mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE) -``` - -To turn on pedantic mode separately from compilation use the `pedantic` argument -to the `$check_syntax()` method. - -```{r pedantic-check_syntax, collapse=TRUE} -mod_pedantic$check_syntax(pedantic = TRUE) -``` - -Using `pedantic=TRUE` via the `$check_syntax()` method also has the advantage -that it can be used even if the model hasn't been compiled yet. This can be -helpful because the pedantic and syntax checks themselves are much faster than -compilation. - -```{r pedantic-check_syntax-2, collapse=TRUE} -file.remove(mod_pedantic$exe_file()) # delete compiled executable -rm(mod_pedantic) - -mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE) -mod_pedantic$check_syntax(pedantic = TRUE) -``` - -### Stan model variables - -If using CmdStan 2.27 or newer, you can obtain the names, types -and dimensions of the data, parameters, transformed parameters -and generated quantities variables of a Stan model using the -`$variables()` method of the `CmdStanModel` object. - -```{r stan_file_variables} -stan_file_variables <- write_stan_file(" -data { - int J; - vector[J] sigma; - vector[J] y; -} -parameters { - real mu; - real tau; - vector[J] theta_raw; -} -transformed parameters { - vector[J] theta = mu + tau * theta_raw; -} -model { - target += normal_lpdf(tau | 0, 10); - target += normal_lpdf(mu | 0, 10); - target += normal_lpdf(theta_raw | 0, 1); - target += normal_lpdf(y | theta, sigma); -} -") -mod_v <- cmdstan_model(stan_file_variables) -variables <- mod_v$variables() -``` - -The `$variables()` method returns a list with `data`, `parameters`, -`transformed_parameters` and `generated_quantities` elements, each -corresponding to variables in their respective block of the program. Transformed -data variables are not listed as they are not used in the model's input -or output. - -```{r variables-list-names} -names(variables) -names(variables$data) -names(variables$parameters) -names(variables$transformed_parameters) -names(variables$generated_quantities) -``` - -Each variable is represented as a list containing the type -information (currently limited to `real` or `int`) and the number of dimensions. - -```{r variable-type-dims} -variables$data$J -variables$data$sigma -variables$parameters$tau -variables$transformed_parameters$theta -``` - -### Executable location - -By default, the executable is created in the same directory as the file -containing the Stan program. You can also specify a different location with the -`dir` argument. - -```{r compile-with-dir, eval = FALSE} -mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable") -``` - -## Processing data - -There are three data formats that CmdStanR allows when fitting a model: - -* named list of R objects -* JSON file -* R dump file - -### Named list of R objects - -Like the RStan interface, CmdStanR accepts a named list of R objects where the -names correspond to variables declared in the data block of the Stan program. -In the Bernoulli model the data is `N`, the number of data points, and `y` -an integer array of observations. - -```{r print-program-again} -mod$print() -``` - -```{r data-list, eval=FALSE} -# data block has 'N' and 'y' -data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -fit <- mod$sample(data = data_list) -``` - -Because CmdStan doesn't accept lists of R objects, CmdStanR will first write the -data to a temporary JSON file using `write_stan_json()`. This happens -internally, but it is also possible to call `write_stan_json()` directly. - -```{r write_stan_json} -data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -json_file <- tempfile(fileext = ".json") -write_stan_json(data_list, json_file) -cat(readLines(json_file), sep = "\n") -``` - -### JSON file - -If you already have your data in a JSON file you can just pass that file -directly to CmdStanR instead of using a list of R objects. For example, we could -pass in the JSON file we created above using `write_stan_json()`: - -```{r data-json, eval=FALSE} -fit <- mod$sample(data = json_file) -``` - - -### R dump file - -Finally, it is also possible to use the R dump file format. This is *not* -recommended because CmdStan can process JSON faster than R dump, but CmdStanR -allows it because CmdStan will accept files created by `rstan::stan_rdump()`: - -```{r data-rdump, eval=FALSE} -rdump_file <- tempfile(fileext = ".data.R") -rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list)) -cat(readLines(rdump_file), sep = "\n") -fit <- mod$sample(data = rdump_file) -``` - - -## Writing CmdStan output to CSV - -### Default temporary files - -```{r sample-tempdir, results = "hide"} -data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -fit <- mod$sample(data = data_list) -``` - -When fitting a model, the default behavior is to write the output from CmdStan -to CSV files in a temporary directory. - -```{r output_files} -fit$output_files() -``` - -These files will be lost if you end your R session or if you remove the -`fit` object and force (or wait for) garbage collection. - - -```{r gc} -files <- fit$output_files() -file.exists(files) - -rm(fit) -gc() - -file.exists(files) -``` - -### Non-temporary files - -To save these files to a non-temporary location there are two options. You -can either specify the `output_dir` argument to `mod$sample()` or use -`fit$save_output_files()` after fitting the model. - -```{r save_output_files, eval=FALSE} -# see ?save_output_files for info on optional arguments -fit$save_output_files(dir = "path/to/directory") -``` - -```{r output_dir, eval = FALSE} -fit <- mod$sample( - data = data_list, - output_dir = "path/to/directory" -) -``` - - -## Reading CmdStan output into R - -### Lazy CSV reading - -With the exception of some diagnostic information, the CSV files are not read -into R until their contents are requested by calling a method that requires them -(e.g., `fit$draws()`, `fit$summary()`, etc.). If we examine the structure of the -`fit` object, notice how the `Private` slot `draws_` is `NULL`, indicating that -the CSV files haven't yet been read into R. - -```{r refit, include=FALSE} -fit <- mod$sample(data = data_list) -``` -```{r csv-not-read} -str(fit) -``` - -After we call a method that requires the draws then if we reexamine the -structure of the object we will see that the `draws_` slot in `Private` -is no longer empty. - -```{r for-csv-reading} -draws <- fit$draws() # force CSVs to be read into R -str(fit) -``` - -For models with many parameters, transformed parameters, or generated -quantities, if only some are requested (e.g., by specifying the `variables` -argument to `fit$draws()`) then CmdStanR will only read in the requested -variables (unless they have already been read in). - -### read_cmdstan_csv() - -Internally, the `read_cmdstan_csv()` function is used to read the CmdStan CSV -files into R. This function is exposed to users, so you can also call it -directly. - -```{r read_cmdstan_csv} -# see ?read_cmdstan_csv for info on optional arguments controlling -# what information is read in -csv_contents <- read_cmdstan_csv(fit$output_files()) -str(csv_contents) -``` - -### as_cmdstan_fit() - -If you need to manually create fitted model objects from CmdStan CSV files use -`as_cmdstan_fit()`. - -```{r as_cmdstan_fit} -fit2 <- as_cmdstan_fit(fit$output_files()) -``` - -This is pointless in our case since we have the original `fit` object, but this -function can be used to create fitted model objects (`CmdStanMCMC`, -`CmdStanMLE`, etc.) from any CmdStan CSV files. - -### Saving and accessing advanced algorithm info (latent dynamics) - -If `save_latent_dynamics` is set to `TRUE` when running the `$sample()` method -then additional CSV files are created (one per chain) that provide access to -quantities used under the hood by Stan's implementation of dynamic Hamiltonian -Monte Carlo. - -CmdStanR does not yet provide a special method for processing these files but -they can be read into R using R's standard CSV reading functions. - - -```{r save_latent_dynamics, results = "hide"} -fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE) -``` -```{r read-latent-dynamics} -fit$latent_dynamics_files() - -# read one of the files in -x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#") -head(x) -``` - -The column `lp__` is also provided via `fit$draws()`, and the columns -`accept_stat__`, `stepsize__`, `treedepth__`, `n_leapfrog__`, `divergent__`, and -`energy__` are also provided by `fit$sampler_diagnostics()`, but there are -several columns unique to the latent dynamics file. - -```{r explore-latent-dynamics} -head(x[, c("theta", "p_theta", "g_theta")]) -``` - -Our model has a single parameter `theta` and the three columns above correspond -to `theta` in the _unconstrained_ space (`theta` on the constrained space is -accessed via `fit$draws()`), the auxiliary momentum `p_theta`, and the gradient -`g_theta`. In general, each of these three columns will exist for _every_ -parameter in the model. - - -## Developing using CmdStanR - -CmdStanR can of course be used for developing other packages that require compiling -and running Stan models as well as using new or custom Stan features available -through CmdStan. - -### Pre-compiled Stan models in R packages - -You may compile a Stan model at runtime (e.g. just before sampling), -or you may compile all the models inside the package file system in advance at installation time. -The latter avoids compilations at runtime, which matters in centrally managed R installations -where users should not compile their own software. - -To pre-compile all the models in a package, -you may create top-level scripts `configure` and `configure.win` -which run `cmdstan_model()` with `compile = TRUE` and save the compiled executables -somewhere inside the `inst/` folder of the package source. -The [`instantiate`](https://wlandau.github.io/instantiate/) package helps developers -configure packages this way, -and it documents other topics such as submitting to CRAN and administering CmdStan. -Kevin Ushey's [`configure`](https://github.com/kevinushey/configure) package helps -create and manage package configuration files in general. - - -### Troubleshooting and debugging - -When developing or testing new features it might be useful to have more -information on how CmdStan is called internally and to see more information -printed when compiling or running models. This can be enabled for an entire R -session by setting the option `"cmdstanr_verbose"` to `TRUE`. - -```{r verbose-mode} -options("cmdstanr_verbose"=TRUE) - -mod <- cmdstan_model(stan_file, force_recompile = TRUE) -fit <- mod$sample( - data = data_list, - chains = 1, - iter_warmup = 100, - iter_sampling = 100 -) -``` - -```{r include=FALSE} -options("cmdstanr_verbose" = FALSE) -``` diff --git a/doc/cmdstanr-internals.html b/doc/cmdstanr-internals.html deleted file mode 100644 index 4d13fcd1..00000000 --- a/doc/cmdstanr-internals.html +++ /dev/null @@ -1,783 +0,0 @@ - - - - - - - - - - - - - - - -How does CmdStanR work? - - - - - - - - - - - - - - - - - - - - - - - - - - -

How does CmdStanR work?

-

Jonah Gabry and Rok Češnovar

- - - - -
-

Introduction

-

This vignette is intended to be read after the Getting -started with CmdStanR vignette. Please read that first for -important background. In this document we provide additional details -about compiling models, passing in data, and how CmdStan output is saved -and read back into R.

-

We will only use the $sample() method in examples, but -all model fitting methods work in a similar way under the hood.

-
library(cmdstanr)
-check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)
-
-
-

Compilation

-
-

Immediate compilation

-

The cmdstan_model() function creates a new -CmdStanModel object. The CmdStanModel object -stores the path to a Stan program as well as the path to a compiled -executable.

-
stan_file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan")
-mod <- cmdstan_model(stan_file)
-mod$print()
-mod$stan_file()
-mod$exe_file()
-

Subsequently, if you create a CmdStanModel object from -the same Stan file then compilation will be skipped (assuming the file -hasn’t changed).

-
mod <- cmdstan_model(stan_file)
-

Internally, cmdstan_model() first creates the -CmdStanModel object from just the Stan file and then calls -its $compile() -method. Optional arguments to the $compile() method can be -passed via ....

-
mod <- cmdstan_model(
-  stan_file,
-  force_recompile = TRUE,
-  include_paths = "paths/to/directories/with/included/files",
-  cpp_options = list(stan_threads = TRUE, STANC2 = TRUE)
-)
-
-
-

Delayed compilation

-

It is also possible to delay compilation when creating the -CmdStanModel object by specifying -compile=FALSE and then later calling the -$compile() method directly.

-
unlink(mod$exe_file())
-mod <- cmdstan_model(stan_file, compile = FALSE)
-mod$exe_file() # not yet created
-mod$compile()
-mod$exe_file()
-
-
-

Pedantic check

-

If you are using CmdStan version 2.24 or later and CmdStanR version -0.2.1 or later, you can run a pedantic check for your model. CmdStanR -will always check that your Stan program does not contain any invalid -syntax but with pedantic mode enabled the check will also warn you about -other potential issues in your model, for example:

-
    -
  • Distribution usages issues: distribution arguments do not match the -distribution specification, or some specific distribution is used in an -inadvisable way.
  • -
  • Unused parameter: a parameter is defined but does not contribute to -target.
  • -
  • Large or small constant in a distribution: very large or very small -constants are used as distribution arguments.
  • -
  • Control flow depends on a parameter: branching control flow (like -if/else) depends on a parameter value.
  • -
  • Parameter has multiple twiddles: a parameter is on the left-hand -side of multiple twiddles (i.e., multiple ~ symbols).
  • -
  • Parameter has zero or multiple priors: a parameter has zero or more -than one prior distribution.
  • -
  • Variable is used before assignment: a variable is used before being -assigned a value.
  • -
  • Strict or nonsensical parameter bounds: a parameter is given -questionable bounds.
  • -
-

For the latest information on the checks performed in pedantic mode -see the Pedantic -mode chapter in the Stan Reference Manual.

-

Pedantic mode is available when compiling the model or when using the -separate $check_syntax() method of a -CmdStanModel object. Internally this corresponds to setting -the stanc (Stan transpiler) option -warn-pedantic. Here we demonstrate pedantic mode with a -Stan program that is syntactically correct but is missing a lower bound -and a prior for a parameter.

-
stan_file_pedantic <- write_stan_file("
-data {
-  int N;
-  array[N] int y;
-}
-parameters {
-  // should have <lower=0> but omitting to demonstrate pedantic mode
-  real lambda;
-}
-model {
-  y ~ poisson(lambda);
-}
-")
-

To turn on pedantic mode at compile time you can set -pedantic=TRUE in the call to cmdstan_model() -(or when calling the $compile() method directly if using -the delayed compilation approach described above).

-
mod_pedantic <- cmdstan_model(stan_file_pedantic, pedantic = TRUE)
-

To turn on pedantic mode separately from compilation use the -pedantic argument to the $check_syntax() -method.

-
mod_pedantic$check_syntax(pedantic = TRUE)
-

Using pedantic=TRUE via the $check_syntax() -method also has the advantage that it can be used even if the model -hasn’t been compiled yet. This can be helpful because the pedantic and -syntax checks themselves are much faster than compilation.

-
file.remove(mod_pedantic$exe_file()) # delete compiled executable
-rm(mod_pedantic)
-
-mod_pedantic <- cmdstan_model(stan_file_pedantic, compile = FALSE)
-mod_pedantic$check_syntax(pedantic = TRUE)
-
-
-

Stan model variables

-

If using CmdStan 2.27 or newer, you can obtain the names, types and -dimensions of the data, parameters, transformed parameters and generated -quantities variables of a Stan model using the $variables() -method of the CmdStanModel object.

-
stan_file_variables <- write_stan_file("
-data {
-  int<lower=1> J;
-  vector<lower=0>[J] sigma;
-  vector[J] y;
-}
-parameters {
-  real mu;
-  real<lower=0> tau;
-  vector[J] theta_raw;
-}
-transformed parameters {
-  vector[J] theta = mu + tau * theta_raw;
-}
-model {
-  target += normal_lpdf(tau | 0, 10);
-  target += normal_lpdf(mu | 0, 10);
-  target += normal_lpdf(theta_raw | 0, 1);
-  target += normal_lpdf(y | theta, sigma);
-}
-")
-mod_v <- cmdstan_model(stan_file_variables)
-variables <- mod_v$variables()
-

The $variables() method returns a list with -data, parameters, -transformed_parameters and -generated_quantities elements, each corresponding to -variables in their respective block of the program. Transformed data -variables are not listed as they are not used in the model’s input or -output.

-
names(variables)
-names(variables$data)
-names(variables$parameters)
-names(variables$transformed_parameters)
-names(variables$generated_quantities)
-

Each variable is represented as a list containing the type -information (currently limited to real or int) -and the number of dimensions.

-
variables$data$J
-variables$data$sigma
-variables$parameters$tau
-variables$transformed_parameters$theta
-
-
-

Executable location

-

By default, the executable is created in the same directory as the -file containing the Stan program. You can also specify a different -location with the dir argument.

-
mod <- cmdstan_model(stan_file, dir = "path/to/directory/for/executable")
-
-
-
-

Processing data

-

There are three data formats that CmdStanR allows when fitting a -model:

-
    -
  • named list of R objects
  • -
  • JSON file
  • -
  • R dump file
  • -
-
-

Named list of R objects

-

Like the RStan interface, CmdStanR accepts a named list of R objects -where the names correspond to variables declared in the data block of -the Stan program. In the Bernoulli model the data is N, the -number of data points, and y an integer array of -observations.

-
mod$print()
-
# data block has 'N' and 'y'
-data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
-fit <- mod$sample(data = data_list)
-

Because CmdStan doesn’t accept lists of R objects, CmdStanR will -first write the data to a temporary JSON file using -write_stan_json(). This happens internally, but it is also -possible to call write_stan_json() directly.

-
data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
-json_file <- tempfile(fileext = ".json")
-write_stan_json(data_list, json_file)
-cat(readLines(json_file), sep = "\n")
-
-
-

JSON file

-

If you already have your data in a JSON file you can just pass that -file directly to CmdStanR instead of using a list of R objects. For -example, we could pass in the JSON file we created above using -write_stan_json():

-
fit <- mod$sample(data = json_file)
-
-
-

R dump file

-

Finally, it is also possible to use the R dump file format. This is -not recommended because CmdStan can process JSON faster than R -dump, but CmdStanR allows it because CmdStan will accept files created -by rstan::stan_rdump():

-
rdump_file <- tempfile(fileext = ".data.R")
-rstan::stan_rdump(names(data_list), file = rdump_file, envir = list2env(data_list))
-cat(readLines(rdump_file), sep = "\n")
-fit <- mod$sample(data = rdump_file)
-
-
-
-

Writing CmdStan output to CSV

-
-

Default temporary files

-
data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
-fit <- mod$sample(data = data_list)
-

When fitting a model, the default behavior is to write the output -from CmdStan to CSV files in a temporary directory.

-
fit$output_files()
-

These files will be lost if you end your R session or if you remove -the fit object and force (or wait for) garbage -collection.

-
files <- fit$output_files()
-file.exists(files)
-
-rm(fit)
-gc()
-
-file.exists(files)
-
-
-

Non-temporary files

-

To save these files to a non-temporary location there are two -options. You can either specify the output_dir argument to -mod$sample() or use fit$save_output_files() -after fitting the model.

-
# see ?save_output_files for info on optional arguments
-fit$save_output_files(dir = "path/to/directory")
-
fit <- mod$sample(
-  data = data_list,
-  output_dir = "path/to/directory"
-)
-
-
-
-

Reading CmdStan output into R

-
-

Lazy CSV reading

-

With the exception of some diagnostic information, the CSV files are -not read into R until their contents are requested by calling a method -that requires them (e.g., fit$draws(), -fit$summary(), etc.). If we examine the structure of the -fit object, notice how the Private slot -draws_ is NULL, indicating that the CSV files -haven’t yet been read into R.

-
str(fit)
-

After we call a method that requires the draws then if we reexamine -the structure of the object we will see that the draws_ -slot in Private is no longer empty.

-
draws <- fit$draws() # force CSVs to be read into R
-str(fit)
-

For models with many parameters, transformed parameters, or generated -quantities, if only some are requested (e.g., by specifying the -variables argument to fit$draws()) then -CmdStanR will only read in the requested variables (unless they have -already been read in).

-
-
-

read_cmdstan_csv()

-

Internally, the read_cmdstan_csv() function is used to -read the CmdStan CSV files into R. This function is exposed to users, so -you can also call it directly.

-
# see ?read_cmdstan_csv for info on optional arguments controlling
-# what information is read in
-csv_contents <- read_cmdstan_csv(fit$output_files())
-str(csv_contents)
-
-
-

as_cmdstan_fit()

-

If you need to manually create fitted model objects from CmdStan CSV -files use as_cmdstan_fit().

-
fit2 <- as_cmdstan_fit(fit$output_files())
-

This is pointless in our case since we have the original -fit object, but this function can be used to create fitted -model objects (CmdStanMCMC, CmdStanMLE, etc.) -from any CmdStan CSV files.

-
-
-

Saving and accessing advanced algorithm info (latent dynamics)

-

If save_latent_dynamics is set to TRUE when -running the $sample() method then additional CSV files are -created (one per chain) that provide access to quantities used under the -hood by Stan’s implementation of dynamic Hamiltonian Monte Carlo.

-

CmdStanR does not yet provide a special method for processing these -files but they can be read into R using R’s standard CSV reading -functions.

-
fit <- mod$sample(data = data_list, save_latent_dynamics = TRUE)
-
fit$latent_dynamics_files()
-
-# read one of the files in
-x <- utils::read.csv(fit$latent_dynamics_files()[1], comment.char = "#")
-head(x)
-

The column lp__ is also provided via -fit$draws(), and the columns accept_stat__, -stepsize__, treedepth__, -n_leapfrog__, divergent__, and -energy__ are also provided by -fit$sampler_diagnostics(), but there are several columns -unique to the latent dynamics file.

-
head(x[, c("theta", "p_theta", "g_theta")])
-

Our model has a single parameter theta and the three -columns above correspond to theta in the -unconstrained space (theta on the constrained -space is accessed via fit$draws()), the auxiliary momentum -p_theta, and the gradient g_theta. In general, -each of these three columns will exist for every parameter in -the model.

-
-
-
-

Developing using CmdStanR

-

CmdStanR can of course be used for developing other packages that -require compiling and running Stan models as well as using new or custom -Stan features available through CmdStan.

-
-

Pre-compiled Stan models in R packages

-

You may compile a Stan model at runtime (e.g. just before sampling), -or you may compile all the models inside the package file system in -advance at installation time. The latter avoids compilations at runtime, -which matters in centrally managed R installations where users should -not compile their own software.

-

To pre-compile all the models in a package, you may create top-level -scripts configure and configure.win which run -cmdstan_model() with compile = TRUE and save -the compiled executables somewhere inside the inst/ folder -of the package source. The instantiate -package helps developers configure packages this way, and it documents -other topics such as submitting to CRAN and administering CmdStan. Kevin -Ushey’s configure -package helps create and manage package configuration files in -general.

-
-
-

Troubleshooting and debugging

-

When developing or testing new features it might be useful to have -more information on how CmdStan is called internally and to see more -information printed when compiling or running models. This can be -enabled for an entire R session by setting the option -"cmdstanr_verbose" to TRUE.

-
options("cmdstanr_verbose"=TRUE)
-
-mod <- cmdstan_model(stan_file, force_recompile = TRUE)
-fit <- mod$sample(
-  data = data_list,
-  chains = 1,
-  iter_warmup = 100,
-  iter_sampling = 100
-)
-
-
- - - - - - - - - - - diff --git a/doc/cmdstanr.R b/doc/cmdstanr.R deleted file mode 100644 index ce0e942f..00000000 --- a/doc/cmdstanr.R +++ /dev/null @@ -1,230 +0,0 @@ -params <- -list(EVAL = FALSE) - -## ----settings-knitr, include=FALSE-------------------------------------------- -stopifnot(require(knitr)) -opts_chunk$set( - # collapse = TRUE, - dev = "png", - dpi = 150, - fig.asp = 0.618, - fig.width = 5, - out.width = "60%", - fig.align = "center", - comment = NA, - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) - -## ----install, eval=FALSE------------------------------------------------------ -# # we recommend running this is a fresh R session or restarting your current session -# install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) - -## ----library, message=FALSE--------------------------------------------------- -# library(cmdstanr) -# library(posterior) -# library(bayesplot) -# color_scheme_set("brightblue") - -## ----check-toolchain---------------------------------------------------------- -# check_cmdstan_toolchain() - -## ----install_cmdstan-1, include = FALSE--------------------------------------- -# if (!dir.exists(cmdstan_default_path())) { -# install_cmdstan() -# } - -## ----install_cmdstan-2, eval=FALSE-------------------------------------------- -# install_cmdstan(cores = 2) - -## ----set_cmdstan_path, eval=FALSE--------------------------------------------- -# set_cmdstan_path(PATH_TO_CMDSTAN) - -## ----cmdstan_path------------------------------------------------------------- -# cmdstan_path() -# cmdstan_version() - -## ----cmdstan_model------------------------------------------------------------ -# file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") -# mod <- cmdstan_model(file) - -## ----compile------------------------------------------------------------------ -# mod$print() - -## ----exe_file----------------------------------------------------------------- -# mod$exe_file() - -## ----sample------------------------------------------------------------------- -# # names correspond to the data block in the Stan program -# data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) -# -# fit <- mod$sample( -# data = data_list, -# seed = 123, -# chains = 4, -# parallel_chains = 4, -# refresh = 500 # print update every 500 iters -# ) - -## ----summary, eval=FALSE------------------------------------------------------ -# fit$summary() -# fit$summary(variables = c("theta", "lp__"), "mean", "sd") -# -# # use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5) -# fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5)) -# -# # summarise all variables with default and additional summary measures -# fit$summary( -# variables = NULL, -# posterior::default_summary_measures(), -# extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) -# ) - -## ---- echo=FALSE-------------------------------------------------------------- -# # NOTE: the hack of using print.data.frame in chunks with echo=FALSE -# # is used because the pillar formatting of posterior draws_summary objects -# # isn't playing nicely with pkgdown::build_articles(). -# options(digits = 2) -# print.data.frame(fit$summary()) -# print.data.frame(fit$summary(variables = c("theta", "lp__"), "mean", "sd")) -# print.data.frame(fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))) -# print.data.frame(fit$summary( -# variables = NULL, -# posterior::default_summary_measures(), -# extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) -# )) - -## ----draws, message=FALSE----------------------------------------------------- -# # default is a 3-D draws_array object from the posterior package -# # iterations x chains x variables -# draws_arr <- fit$draws() # or format="array" -# str(draws_arr) -# -# # draws x variables data frame -# draws_df <- fit$draws(format = "df") -# str(draws_df) -# print(draws_df) - -## ----as_draws----------------------------------------------------------------- -# # this should be identical to draws_df created via draws(format = "df") -# draws_df_2 <- as_draws_df(draws_arr) -# identical(draws_df, draws_df_2) - -## ----plots, message=FALSE----------------------------------------------------- -# mcmc_hist(fit$draws("theta")) - -## ----sampler_diagnostics------------------------------------------------------ -# # this is a draws_array object from the posterior package -# str(fit$sampler_diagnostics()) -# -# # this is a draws_df object from the posterior package -# str(fit$sampler_diagnostics(format = "df")) - -## ----diagnostic_summary------------------------------------------------------- -# fit$diagnostic_summary() - -## ----fit-with-warnings, results='hold'---------------------------------------- -# fit_with_warning <- cmdstanr_example("schools") - -## ----diagnostic_summary-with-warnings----------------------------------------- -# diagnostics <- fit_with_warning$diagnostic_summary() -# print(diagnostics) -# -# # number of divergences reported in warning is the sum of the per chain values -# sum(diagnostics$num_divergent) - -## ----stanfit, eval=FALSE------------------------------------------------------ -# stanfit <- rstan::read_stan_csv(fit$output_files()) - -## ----optimize----------------------------------------------------------------- -# fit_mle <- mod$optimize(data = data_list, seed = 123) -# fit_mle$print() # includes lp__ (log prob calculated by Stan program) -# fit_mle$mle("theta") - -## ----plot-mle, message = FALSE------------------------------------------------ -# mcmc_hist(fit$draws("theta")) + -# vline_at(fit_mle$mle("theta"), size = 1.5) - -## ----optimize-map------------------------------------------------------------- -# fit_map <- mod$optimize( -# data = data_list, -# jacobian = TRUE, -# seed = 123 -# ) - -## ----laplace------------------------------------------------------------------ -# fit_laplace <- mod$laplace( -# mode = fit_map, -# draws = 4000, -# data = data_list, -# seed = 123, -# refresh = 1000 -# ) -# fit_laplace$print("theta") -# mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) - -## ----variational-------------------------------------------------------------- -# fit_vb <- mod$variational( -# data = data_list, -# seed = 123, -# draws = 4000 -# ) -# fit_vb$print("theta") -# mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) - -## ----pathfinder--------------------------------------------------------------- -# fit_pf <- mod$pathfinder( -# data = data_list, -# seed = 123, -# draws = 4000 -# ) -# fit_pf$print("theta") - -## ----plot-compare-pf, message = FALSE----------------------------------------- -# mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) + -# ggplot2::labs(subtitle = "Approximate posterior from pathfinder") + -# ggplot2::xlim(0, 1) - -## ----plot-compare-vb, message = FALSE----------------------------------------- -# mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) + -# ggplot2::labs(subtitle = "Approximate posterior from variational") + -# ggplot2::xlim(0, 1) - -## ----plot-compare-laplace, message = FALSE------------------------------------ -# mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) + -# ggplot2::labs(subtitle = "Approximate posterior from Laplace") + -# ggplot2::xlim(0, 1) - -## ----plot-compare-mcmc, message = FALSE--------------------------------------- -# mcmc_hist(fit$draws("theta"), binwidth = 0.025) + -# ggplot2::labs(subtitle = "Posterior from MCMC") + -# ggplot2::xlim(0, 1) - -## ----save_object, eval=FALSE-------------------------------------------------- -# fit$save_object(file = "fit.RDS") -# -# # can be read back in using readRDS -# fit2 <- readRDS("fit.RDS") - -## ----save_object_qs_full, eval = FALSE---------------------------------------- -# # Load CmdStan output files into the fitted model object. -# fit$draws() # Load posterior draws into the object. -# try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics. -# try(fit$init(), silent = TRUE) # Load user-defined initial values. -# try(fit$profiles(), silent = TRUE) # Load profiling samples. -# -# # Save the object to a file. -# qs::qsave(x = fit, file = "fit.qs") -# -# # Read the object. -# fit2 <- qs::qread("fit.qs") - -## ----save_object_qs_small, eval = FALSE--------------------------------------- -# # Load posterior draws into the fitted model object and omit other output. -# fit$draws() -# -# # Save the object to a file. -# qs::qsave(x = fit, file = "fit.qs") -# -# # Read the object. -# fit2 <- qs::qread("fit.qs") - diff --git a/doc/cmdstanr.Rmd b/doc/cmdstanr.Rmd deleted file mode 100644 index b9b2e341..00000000 --- a/doc/cmdstanr.Rmd +++ /dev/null @@ -1,554 +0,0 @@ ---- -title: "Getting started with CmdStanR" -author: "Jonah Gabry, Rok Češnovar, and Andrew Johnson" -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 3 -params: - EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") -vignette: > - %\VignetteIndexEntry{Getting started with CmdStanR} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r child="children/_settings-knitr.Rmd"} -``` - -## Introduction - -CmdStanR (Command Stan R) is a lightweight interface to -[Stan](https://mc-stan.org/) for R users that provides an alternative to the -traditional [RStan](https://mc-stan.org/rstan/) interface. See the [*Comparison -with RStan*](#comparison-with-rstan) section later in this vignette for more -details on how the two interfaces differ. - -Using CmdStanR requires installing the **cmdstanr** R package and also -CmdStan, the command line interface to Stan. First we install the R package -by running the following command in R. - -```{r install, eval=FALSE} -# we recommend running this is a fresh R session or restarting your current session -install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) -``` - -We can now load the package like any other R package. We'll also load the -**bayesplot** and **posterior** packages to use later in examples. - -```{r library, message=FALSE} -library(cmdstanr) -library(posterior) -library(bayesplot) -color_scheme_set("brightblue") -``` - -## Installing CmdStan - -CmdStanR requires a working installation of -[CmdStan](https://mc-stan.org/users/interfaces/cmdstan.html), the shell -interface to Stan. If you don't have CmdStan installed then CmdStanR can install -it for you, assuming you have a suitable C++ toolchain. The requirements are -described in the CmdStan Guide: - -* https://mc-stan.org/docs/cmdstan-guide/cmdstan-installation.html - -To double check that your toolchain is set up properly you can call -the `check_cmdstan_toolchain()` function: - -```{r check-toolchain} -check_cmdstan_toolchain() -``` - -If your toolchain is configured correctly then CmdStan can be installed by -calling the -[`install_cmdstan()`](https://mc-stan.org/cmdstanr/reference/install_cmdstan.html) -function: - -```{r install_cmdstan-1, include = FALSE} -if (!dir.exists(cmdstan_default_path())) { - install_cmdstan() -} -``` -```{r install_cmdstan-2, eval=FALSE} -install_cmdstan(cores = 2) -``` - -Before CmdStanR can be used it needs to know where the CmdStan installation is -located. When the package is loaded it tries to help automate this to avoid -having to manually set the path every session: - -1. If the environment variable `"CMDSTAN"` exists at load time then its value -will be automatically set as the default path to CmdStan for the R session. This -is useful if your CmdStan installation is not located in the default directory -that would have been used by `install_cmdstan()` (see #2). - -2. If no environment variable is found when loaded but any directory in the form -`".cmdstan/cmdstan-[version]"`, for example `".cmdstan/cmdstan-2.23.0"`, -exists in the user's home directory (`Sys.getenv("HOME")`, -*not* the current working directory) then the path to the CmdStan with the -largest version number will be set as the path to CmdStan for the R session. -This is the same as the default directory that `install_cmdstan()` uses to -install the latest version of CmdStan, so if that's how you installed CmdStan -you shouldn't need to manually set the path to CmdStan when loading CmdStanR. - -If neither of these applies (or you want to subsequently change the path) you -can use the `set_cmdstan_path()` function: - -```{r set_cmdstan_path, eval=FALSE} -set_cmdstan_path(PATH_TO_CMDSTAN) -``` - -To check the path to the CmdStan installation and the CmdStan version number -you can use `cmdstan_path()` and `cmdstan_version()`: - -```{r cmdstan_path} -cmdstan_path() -cmdstan_version() -``` - -## Compiling a model - -The `cmdstan_model()` function creates a new -[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) -object from a file containing a Stan program. Under the hood, CmdStan is called -to translate a Stan program to C++ and create a compiled executable. Here we'll -use the example Stan program that comes with the CmdStan installation: - -```{r cmdstan_model} -file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan") -mod <- cmdstan_model(file) -``` - -The object `mod` is an [R6](https://r6.r-lib.org/) reference object of class -[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) and -behaves similarly to R's reference class objects and those in object oriented -programming languages. Methods are accessed using the `$` operator. This design -choice allows for CmdStanR and -[CmdStanPy](https://github.com/stan-dev/cmdstanpy) to provide a similar user -experience and share many implementation details. - -The Stan program can be printed using the `$print()` method: - -```{r compile} -mod$print() -``` - -The path to the compiled executable is returned by the `$exe_file()` -method: - -```{r exe_file} -mod$exe_file() -``` - -## Running MCMC - -The -[`$sample()`](https://mc-stan.org/cmdstanr/reference/model-method-sample.html) -method for -[`CmdStanModel`](https://mc-stan.org/cmdstanr/reference/CmdStanModel.html) -objects runs Stan's default MCMC algorithm. The `data` argument accepts a named -list of R objects (like for RStan) or a path to a data file compatible with -CmdStan (JSON or R dump). - -```{r sample} -# names correspond to the data block in the Stan program -data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1)) - -fit <- mod$sample( - data = data_list, - seed = 123, - chains = 4, - parallel_chains = 4, - refresh = 500 # print update every 500 iters -) -``` - -There are many more arguments that can be passed to the `$sample()` method. -For details follow this link to its separate documentation page: - -* [`$sample()`](https://mc-stan.org/cmdstanr/reference/model-method-sample.html) - -The `$sample()` method creates [R6](https://r6.r-lib.org/) `CmdStanMCMC` -objects, which have many associated methods. Below we will demonstrate some of -the most important methods. For a full list, follow this link to the -`CmdStanMCMC` documentation: - -* [`CmdStanMCMC`](https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html) - -### Posterior summary statistics - -#### Summaries from the posterior package - -The -[`$summary()`](https://mc-stan.org/cmdstanr/reference/fit-method-summary.html) -method calls `summarise_draws()` from the **posterior** package. The -first argument specifies the variables to summarize and any arguments -after that are passed on to `posterior::summarise_draws()` to specify -which summaries to compute, whether to use multiple cores, etc. - -```{r summary, eval=FALSE} -fit$summary() -fit$summary(variables = c("theta", "lp__"), "mean", "sd") - -# use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5) -fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5)) - -# summarise all variables with default and additional summary measures -fit$summary( - variables = NULL, - posterior::default_summary_measures(), - extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) -) -``` - -```{r, echo=FALSE} - # NOTE: the hack of using print.data.frame in chunks with echo=FALSE - # is used because the pillar formatting of posterior draws_summary objects - # isn't playing nicely with pkgdown::build_articles(). - options(digits = 2) - print.data.frame(fit$summary()) - print.data.frame(fit$summary(variables = c("theta", "lp__"), "mean", "sd")) - print.data.frame(fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))) - print.data.frame(fit$summary( - variables = NULL, - posterior::default_summary_measures(), - extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975)) - )) -``` - -#### CmdStan's stansummary utility - -CmdStan itself provides a `stansummary` utility that can be called using the -`$cmdstan_summary()` method. This method will print summaries but won't return -anything. - -### Posterior draws - -#### Extracting draws - -The [`$draws()`](https://mc-stan.org/cmdstanr/reference/fit-method-draws.html) -method can be used to extract the posterior draws in formats provided by the -[**posterior**](https://mc-stan.org/posterior/) package. Here we demonstrate -only the `draws_array` and `draws_df` formats, but the **posterior** package -supports other useful formats as well. - -```{r draws, message=FALSE} -# default is a 3-D draws_array object from the posterior package -# iterations x chains x variables -draws_arr <- fit$draws() # or format="array" -str(draws_arr) - -# draws x variables data frame -draws_df <- fit$draws(format = "df") -str(draws_df) -print(draws_df) -``` - -To convert an existing draws object to a different format use the -`posterior::as_draws_*()` functions. - -```{r as_draws} -# this should be identical to draws_df created via draws(format = "df") -draws_df_2 <- as_draws_df(draws_arr) -identical(draws_df, draws_df_2) -``` - -In general, converting to a different draws format in this way will be slower -than just setting the appropriate format initially in the call to the `$draws()` -method, but in most cases the speed difference will be minor. - -#### Plotting draws - -Plotting posterior distributions is as easy as passing the object returned by -the `$draws()` method directly to plotting functions in our -[**bayesplot**](https://mc-stan.org/bayesplot/) package. - -```{r plots, message=FALSE} -mcmc_hist(fit$draws("theta")) -``` - -### Sampler diagnostics - -#### Extracting diagnostic values for each iteration and chain - -The -[`$sampler_diagnostics()`](https://mc-stan.org/cmdstanr/reference/fit-method-sampler_diagnostics.html) -method extracts the values of the sampler parameters (`treedepth__`, -`divergent__`, etc.) in formats supported by the **posterior** package. The -default is as a 3-D array (iteration x chain x variable). - -```{r sampler_diagnostics} -# this is a draws_array object from the posterior package -str(fit$sampler_diagnostics()) - -# this is a draws_df object from the posterior package -str(fit$sampler_diagnostics(format = "df")) -``` - -#### Sampler diagnostic warnings and summaries - -The `$diagnostic_summary()` method will display any sampler diagnostic warnings and return a summary of diagnostics for each chain. - -```{r diagnostic_summary} -fit$diagnostic_summary() -``` - -We see the number of divergences for each of the four chains, the number -of times the maximum treedepth was hit for each chain, and the E-BFMI -for each chain. - -In this case there were no warnings, so in order to demonstrate the warning -messages we'll use one of the CmdStanR example models that suffers from -divergences. - -```{r fit-with-warnings, results='hold'} -fit_with_warning <- cmdstanr_example("schools") -``` -After fitting there is a warning about divergences. We can also regenerate this warning message later using `fit$diagnostic_summary()`. - -```{r diagnostic_summary-with-warnings} -diagnostics <- fit_with_warning$diagnostic_summary() -print(diagnostics) - -# number of divergences reported in warning is the sum of the per chain values -sum(diagnostics$num_divergent) -``` - -#### CmdStan's diagnose utility - -CmdStan itself provides a `diagnose` utility that can be called using -the `$cmdstan_diagnose()` method. This method will print warnings but won't return anything. - - -### Create a `stanfit` object - -If you have RStan installed then it is also possible to create a `stanfit` -object from the csv output files written by CmdStan. This can be done by using -`rstan::read_stan_csv()` in combination with the `$output_files()` method of the -`CmdStanMCMC` object. This is only needed if you want to fit a model with -CmdStanR but already have a lot of post-processing code that assumes a `stanfit` -object. Otherwise we recommend using the post-processing functionality provided -by CmdStanR itself. - -```{r stanfit, eval=FALSE} -stanfit <- rstan::read_stan_csv(fit$output_files()) -``` - - - -## Running optimization and variational inference - -CmdStanR also supports running Stan's optimization algorithms and its algorithms -for variational approximation of full Bayesian inference. These are run via the -`$optimize()`, `$laplace()`, `$variational()`, and `$pathfinder()` methods, which -are called in a similar way to the `$sample()` method demonstrated above. - -### Optimization - -We can find the (penalized) maximum likelihood estimate (MLE) using [`$optimize()`](https://mc-stan.org/cmdstanr/reference/model-method-optimize.html). - -```{r optimize} -fit_mle <- mod$optimize(data = data_list, seed = 123) -fit_mle$print() # includes lp__ (log prob calculated by Stan program) -fit_mle$mle("theta") -``` - -Here's a plot comparing the penalized MLE to the posterior distribution of -`theta`. - -```{r plot-mle, message = FALSE} -mcmc_hist(fit$draws("theta")) + - vline_at(fit_mle$mle("theta"), size = 1.5) -``` - -For optimization, by default the mode is calculated without the Jacobian -adjustment for constrained variables, which shifts the mode due to the change of -variables. To include the Jacobian adjustment and obtain a maximum a posteriori -(MAP) estimate set `jacobian=TRUE`. See the -[Maximum Likelihood Estimation](https://mc-stan.org/docs/cmdstan-guide/maximum-likelihood-estimation.html) -section of the CmdStan User's Guide for more details. - -```{r optimize-map} -fit_map <- mod$optimize( - data = data_list, - jacobian = TRUE, - seed = 123 -) -``` - -### Laplace Approximation - -The [`$laplace()`](https://mc-stan.org/cmdstanr/reference/model-method-laplace.html) -method produces a sample from a normal approximation centered at the mode of a -distribution in the unconstrained space. If the mode is a MAP estimate, the -samples provide an estimate of the mean and standard deviation of the posterior -distribution. If the mode is the MLE, the sample provides an estimate of the -standard error of the likelihood. Whether the mode is the MAP or MLE depends on -the value of the `jacobian` argument when running optimization. See the -[Laplace Sampling](https://mc-stan.org/docs/cmdstan-guide/laplace-sampling.html) -chapter of the CmdStan User's Guide for more details. - -Here we pass in the `fit_map` object from above as the `mode` argument. If -`mode` is omitted then optimization will be run internally before taking draws -from the normal approximation. - -```{r laplace} -fit_laplace <- mod$laplace( - mode = fit_map, - draws = 4000, - data = data_list, - seed = 123, - refresh = 1000 - ) -fit_laplace$print("theta") -mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) -``` - -### Variational (ADVI) - -We can run Stan's experimental Automatic Differentiation Variational Inference -(ADVI) using the [`$variational()`](https://mc-stan.org/cmdstanr/reference/model-method-variational.html) -method. For details on the ADVI algorithm see the -[CmdStan User's Guide](https://mc-stan.org/docs/cmdstan-guide/variational-inference-algorithm-advi.html). - - -```{r variational} -fit_vb <- mod$variational( - data = data_list, - seed = 123, - draws = 4000 -) -fit_vb$print("theta") -mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) -``` - -### Variational (Pathfinder) - -Stan version 2.33 introduced a new variational method called Pathfinder, -which is intended to be faster and more stable than ADVI. For details on how -Pathfinder works see the section in the -[CmdStan User's Guide](https://mc-stan.org/docs/cmdstan-guide/pathfinder-intro.html#pathfinder-intro). -Pathfinder is run using the [`$pathfinder()`](https://mc-stan.org/cmdstanr/reference/model-method-pathfinder.html) -method. - -```{r pathfinder} -fit_pf <- mod$pathfinder( - data = data_list, - seed = 123, - draws = 4000 -) -fit_pf$print("theta") -``` - - -Let's extract the draws, make the same plot we made after running the other -algorithms, and compare them all. approximation, and compare them all. In this -simple example the distributions are quite similar, but this will not always be -the case for more challenging problems. - -```{r plot-compare-pf, message = FALSE} -mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) + - ggplot2::labs(subtitle = "Approximate posterior from pathfinder") + - ggplot2::xlim(0, 1) -``` -```{r plot-compare-vb, message = FALSE} -mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) + - ggplot2::labs(subtitle = "Approximate posterior from variational") + - ggplot2::xlim(0, 1) -``` -```{r plot-compare-laplace, message = FALSE} -mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) + - ggplot2::labs(subtitle = "Approximate posterior from Laplace") + - ggplot2::xlim(0, 1) -``` -```{r plot-compare-mcmc, message = FALSE} -mcmc_hist(fit$draws("theta"), binwidth = 0.025) + - ggplot2::labs(subtitle = "Posterior from MCMC") + - ggplot2::xlim(0, 1) -``` - -For more details on the `$optimize()`, `$laplace()`, `$variational()`, and -`pathfinder()` methods, follow these links to their documentation pages. - -* [`$optimize()`](https://mc-stan.org/cmdstanr/reference/model-method-optimize.html) -* [`$laplace()`](https://mc-stan.org/cmdstanr/reference/model-method-laplace.html) -* [`$variational()`](https://mc-stan.org/cmdstanr/reference/model-method-variational.html) -* [`$pathfinder()`](https://mc-stan.org/cmdstanr/reference/model-method-pathfinder.html) - - -## Saving fitted model objects - -The [`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) -method provided by CmdStanR is the most convenient way to save a fitted model object -to disk and ensure that all of the contents are available when reading the object back into R. - -```{r save_object, eval=FALSE} -fit$save_object(file = "fit.RDS") - -# can be read back in using readRDS -fit2 <- readRDS("fit.RDS") -``` - -But if your model object is large, then -[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) -could take a long time. -[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) -reads the CmdStan results files into memory, stores them in the model object, -and saves the object with `saveRDS()`. To speed up the process, you can emulate -[`$save_object()`](http://mc-stan.org/cmdstanr/reference/fit-method-save_object.html) -and replace `saveRDS` with the much faster `qsave()` function from the -[`qs`](https://github.com/traversc/qs) package. - -```{r save_object_qs_full, eval = FALSE} -# Load CmdStan output files into the fitted model object. -fit$draws() # Load posterior draws into the object. -try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics. -try(fit$init(), silent = TRUE) # Load user-defined initial values. -try(fit$profiles(), silent = TRUE) # Load profiling samples. - -# Save the object to a file. -qs::qsave(x = fit, file = "fit.qs") - -# Read the object. -fit2 <- qs::qread("fit.qs") -``` - -Storage is even faster if you discard results you do not need to save. -The following example saves only posterior draws and discards -sampler diagnostics, user-specified initial values, and profiling data. - -```{r save_object_qs_small, eval = FALSE} -# Load posterior draws into the fitted model object and omit other output. -fit$draws() - -# Save the object to a file. -qs::qsave(x = fit, file = "fit.qs") - -# Read the object. -fit2 <- qs::qread("fit.qs") -``` - -See the vignette [_How does CmdStanR work?_](http://mc-stan.org/cmdstanr/articles/cmdstanr-internals.html) -for more information about the composition of CmdStanR objects. - -## Comparison with RStan - -```{r child="children/comparison-with-rstan.md"} -``` - -## Additional resources - -There are additional vignettes available that discuss other aspects of using -CmdStanR. These can be found online at the CmdStanR website: - -* https://mc-stan.org/cmdstanr/articles/index.html - -To ask a question please post on the Stan forums: - -* https://discourse.mc-stan.org/ - -To report a bug, suggest a feature (including additions to these vignettes), or to start contributing to CmdStanR -development (new contributors welcome!) please open an issue on GitHub: - -* https://github.com/stan-dev/cmdstanr/issues diff --git a/doc/cmdstanr.html b/doc/cmdstanr.html deleted file mode 100644 index 95a23794..00000000 --- a/doc/cmdstanr.html +++ /dev/null @@ -1,814 +0,0 @@ - - - - - - - - - - - - - - - -Getting started with CmdStanR - - - - - - - - - - - - - - - - - - - - - - - - - - -

Getting started with CmdStanR

-

Jonah Gabry, Rok Češnovar, and Andrew Johnson

- - - - -
-

Introduction

-

CmdStanR (Command Stan R) is a lightweight interface to Stan for R users that provides an -alternative to the traditional RStan interface. See the Comparison with RStan section -later in this vignette for more details on how the two interfaces -differ.

-

Using CmdStanR requires installing the cmdstanr R -package and also CmdStan, the command line interface to Stan. First we -install the R package by running the following command in R.

-
# we recommend running this is a fresh R session or restarting your current session
-install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos")))
-

We can now load the package like any other R package. We’ll also load -the bayesplot and posterior packages -to use later in examples.

-
library(cmdstanr)
-library(posterior)
-library(bayesplot)
-color_scheme_set("brightblue")
-
-
-

Installing CmdStan

-

CmdStanR requires a working installation of CmdStan, -the shell interface to Stan. If you don’t have CmdStan installed then -CmdStanR can install it for you, assuming you have a suitable C++ -toolchain. The requirements are described in the CmdStan Guide:

- -

To double check that your toolchain is set up properly you can call -the check_cmdstan_toolchain() function:

-
check_cmdstan_toolchain()
-

If your toolchain is configured correctly then CmdStan can be -installed by calling the install_cmdstan() -function:

-
install_cmdstan(cores = 2)
-

Before CmdStanR can be used it needs to know where the CmdStan -installation is located. When the package is loaded it tries to help -automate this to avoid having to manually set the path every -session:

-
    -
  1. If the environment variable "CMDSTAN" exists at load -time then its value will be automatically set as the default path to -CmdStan for the R session. This is useful if your CmdStan installation -is not located in the default directory that would have been used by -install_cmdstan() (see #2).

  2. -
  3. If no environment variable is found when loaded but any directory -in the form ".cmdstan/cmdstan-[version]", for example -".cmdstan/cmdstan-2.23.0", exists in the user’s home -directory (Sys.getenv("HOME"), not the current -working directory) then the path to the CmdStan with the largest version -number will be set as the path to CmdStan for the R session. This is the -same as the default directory that install_cmdstan() uses -to install the latest version of CmdStan, so if that’s how you installed -CmdStan you shouldn’t need to manually set the path to CmdStan when -loading CmdStanR.

  4. -
-

If neither of these applies (or you want to subsequently change the -path) you can use the set_cmdstan_path() function:

-
set_cmdstan_path(PATH_TO_CMDSTAN)
-

To check the path to the CmdStan installation and the CmdStan version -number you can use cmdstan_path() and -cmdstan_version():

-
cmdstan_path()
-cmdstan_version()
-
-
-

Compiling a model

-

The cmdstan_model() function creates a new CmdStanModel -object from a file containing a Stan program. Under the hood, CmdStan is -called to translate a Stan program to C++ and create a compiled -executable. Here we’ll use the example Stan program that comes with the -CmdStan installation:

-
file <- file.path(cmdstan_path(), "examples", "bernoulli", "bernoulli.stan")
-mod <- cmdstan_model(file)
-

The object mod is an R6 reference object of class CmdStanModel -and behaves similarly to R’s reference class objects and those in object -oriented programming languages. Methods are accessed using the -$ operator. This design choice allows for CmdStanR and CmdStanPy to provide a -similar user experience and share many implementation details.

-

The Stan program can be printed using the $print() -method:

-
mod$print()
-

The path to the compiled executable is returned by the -$exe_file() method:

-
mod$exe_file()
-
-
-

Running MCMC

-

The $sample() -method for CmdStanModel -objects runs Stan’s default MCMC algorithm. The data -argument accepts a named list of R objects (like for RStan) or a path to -a data file compatible with CmdStan (JSON or R dump).

-
# names correspond to the data block in the Stan program
-data_list <- list(N = 10, y = c(0,1,0,0,0,0,0,0,0,1))
-
-fit <- mod$sample(
-  data = data_list,
-  seed = 123,
-  chains = 4,
-  parallel_chains = 4,
-  refresh = 500 # print update every 500 iters
-)
-

There are many more arguments that can be passed to the -$sample() method. For details follow this link to its -separate documentation page:

- -

The $sample() method creates R6 CmdStanMCMC objects, -which have many associated methods. Below we will demonstrate some of -the most important methods. For a full list, follow this link to the -CmdStanMCMC documentation:

- -
-

Posterior summary statistics

-
-

Summaries from the posterior package

-

The $summary() -method calls summarise_draws() from the -posterior package. The first argument specifies the -variables to summarize and any arguments after that are passed on to -posterior::summarise_draws() to specify which summaries to -compute, whether to use multiple cores, etc.

-
fit$summary()
-fit$summary(variables = c("theta", "lp__"), "mean", "sd")
-
-# use a formula to summarize arbitrary functions, e.g. Pr(theta <= 0.5)
-fit$summary("theta", pr_lt_half = ~ mean(. <= 0.5))
-
-# summarise all variables with default and additional summary measures
-fit$summary(
-  variables = NULL,
-  posterior::default_summary_measures(),
-  extra_quantiles = ~posterior::quantile2(., probs = c(.0275, .975))
-)
-
-
-

CmdStan’s stansummary utility

-

CmdStan itself provides a stansummary utility that can -be called using the $cmdstan_summary() method. This method -will print summaries but won’t return anything.

-
-
-
-

Posterior draws

-
-

Extracting draws

-

The $draws() -method can be used to extract the posterior draws in formats provided by -the posterior -package. Here we demonstrate only the draws_array and -draws_df formats, but the posterior -package supports other useful formats as well.

-
# default is a 3-D draws_array object from the posterior package
-# iterations x chains x variables
-draws_arr <- fit$draws() # or format="array"
-str(draws_arr)
-
-# draws x variables data frame
-draws_df <- fit$draws(format = "df")
-str(draws_df)
-print(draws_df)
-

To convert an existing draws object to a different format use the -posterior::as_draws_*() functions.

-
# this should be identical to draws_df created via draws(format = "df")
-draws_df_2 <- as_draws_df(draws_arr)
-identical(draws_df, draws_df_2)
-

In general, converting to a different draws format in this way will -be slower than just setting the appropriate format initially in the call -to the $draws() method, but in most cases the speed -difference will be minor.

-
-
-

Plotting draws

-

Plotting posterior distributions is as easy as passing the object -returned by the $draws() method directly to plotting -functions in our bayesplot -package.

-
mcmc_hist(fit$draws("theta"))
-
-
-
-

Sampler diagnostics

-
-

Extracting diagnostic values for each iteration and chain

-

The $sampler_diagnostics() -method extracts the values of the sampler parameters -(treedepth__, divergent__, etc.) in formats -supported by the posterior package. The default is as a -3-D array (iteration x chain x variable).

-
# this is a draws_array object from the posterior package
-str(fit$sampler_diagnostics())
-
-# this is a draws_df object from the posterior package
-str(fit$sampler_diagnostics(format = "df"))
-
-
-

Sampler diagnostic warnings and summaries

-

The $diagnostic_summary() method will display any -sampler diagnostic warnings and return a summary of diagnostics for each -chain.

-
fit$diagnostic_summary()
-

We see the number of divergences for each of the four chains, the -number of times the maximum treedepth was hit for each chain, and the -E-BFMI for each chain.

-

In this case there were no warnings, so in order to demonstrate the -warning messages we’ll use one of the CmdStanR example models that -suffers from divergences.

-
fit_with_warning <- cmdstanr_example("schools")
-

After fitting there is a warning about divergences. We can also -regenerate this warning message later using -fit$diagnostic_summary().

-
diagnostics <- fit_with_warning$diagnostic_summary()
-print(diagnostics)
-
-# number of divergences reported in warning is the sum of the per chain values
-sum(diagnostics$num_divergent)
-
-
-

CmdStan’s diagnose utility

-

CmdStan itself provides a diagnose utility that can be -called using the $cmdstan_diagnose() method. This method -will print warnings but won’t return anything.

-
-
-
-

Create a stanfit object

-

If you have RStan installed then it is also possible to create a -stanfit object from the csv output files written by -CmdStan. This can be done by using rstan::read_stan_csv() -in combination with the $output_files() method of the -CmdStanMCMC object. This is only needed if you want to fit -a model with CmdStanR but already have a lot of post-processing code -that assumes a stanfit object. Otherwise we recommend using -the post-processing functionality provided by CmdStanR itself.

-
stanfit <- rstan::read_stan_csv(fit$output_files())
-
-
-
-

Running optimization and variational inference

-

CmdStanR also supports running Stan’s optimization algorithms and its -algorithms for variational approximation of full Bayesian inference. -These are run via the $optimize(), $laplace(), -$variational(), and $pathfinder() methods, -which are called in a similar way to the $sample() method -demonstrated above.

-
-

Optimization

-

We can find the (penalized) maximum likelihood estimate (MLE) using -$optimize().

-
fit_mle <- mod$optimize(data = data_list, seed = 123)
-fit_mle$print() # includes lp__ (log prob calculated by Stan program)
-fit_mle$mle("theta")
-

Here’s a plot comparing the penalized MLE to the posterior -distribution of theta.

-
mcmc_hist(fit$draws("theta")) +
-  vline_at(fit_mle$mle("theta"), size = 1.5)
-

For optimization, by default the mode is calculated without the -Jacobian adjustment for constrained variables, which shifts the mode due -to the change of variables. To include the Jacobian adjustment and -obtain a maximum a posteriori (MAP) estimate set -jacobian=TRUE. See the Maximum -Likelihood Estimation section of the CmdStan User’s Guide for more -details.

-
fit_map <- mod$optimize(
-  data = data_list,
-  jacobian = TRUE,
-  seed = 123
-)
-
-
-

Laplace Approximation

-

The $laplace() -method produces a sample from a normal approximation centered at the -mode of a distribution in the unconstrained space. If the mode is a MAP -estimate, the samples provide an estimate of the mean and standard -deviation of the posterior distribution. If the mode is the MLE, the -sample provides an estimate of the standard error of the likelihood. -Whether the mode is the MAP or MLE depends on the value of the -jacobian argument when running optimization. See the Laplace -Sampling chapter of the CmdStan User’s Guide for more details.

-

Here we pass in the fit_map object from above as the -mode argument. If mode is omitted then -optimization will be run internally before taking draws from the normal -approximation.

-
fit_laplace <- mod$laplace(
-    mode = fit_map,
-    draws = 4000,
-    data = data_list,
-    seed = 123,
-    refresh = 1000
-  )
-fit_laplace$print("theta")
-mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025)
-
-
-

Variational (ADVI)

-

We can run Stan’s experimental Automatic Differentiation Variational -Inference (ADVI) using the $variational() -method. For details on the ADVI algorithm see the CmdStan -User’s Guide.

-
fit_vb <- mod$variational(
-  data = data_list,
-  seed = 123,
-  draws = 4000
-)
-fit_vb$print("theta")
-mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025)
-
-
-

Variational (Pathfinder)

-

Stan version 2.33 introduced a new variational method called -Pathfinder, which is intended to be faster and more stable than ADVI. -For details on how Pathfinder works see the section in the CmdStan -User’s Guide. Pathfinder is run using the $pathfinder() -method.

-
fit_pf <- mod$pathfinder(
-  data = data_list,
-  seed = 123,
-  draws = 4000
-)
-fit_pf$print("theta")
-

Let’s extract the draws, make the same plot we made after running the -other algorithms, and compare them all. approximation, and compare them -all. In this simple example the distributions are quite similar, but -this will not always be the case for more challenging problems.

-
mcmc_hist(fit_pf$draws("theta"), binwidth = 0.025) +
-  ggplot2::labs(subtitle = "Approximate posterior from pathfinder") +
-  ggplot2::xlim(0, 1)
-
mcmc_hist(fit_vb$draws("theta"), binwidth = 0.025) +
-  ggplot2::labs(subtitle = "Approximate posterior from variational") +
-  ggplot2::xlim(0, 1)
-
mcmc_hist(fit_laplace$draws("theta"), binwidth = 0.025) +
-  ggplot2::labs(subtitle = "Approximate posterior from Laplace") +
-  ggplot2::xlim(0, 1)
-
mcmc_hist(fit$draws("theta"), binwidth = 0.025) +
-  ggplot2::labs(subtitle = "Posterior from MCMC") +
-  ggplot2::xlim(0, 1)
-

For more details on the $optimize(), -$laplace(), $variational(), and -pathfinder() methods, follow these links to their -documentation pages.

- -
-
-
-

Saving fitted model objects

-

The $save_object() -method provided by CmdStanR is the most convenient way to save a fitted -model object to disk and ensure that all of the contents are available -when reading the object back into R.

-
fit$save_object(file = "fit.RDS")
-
-# can be read back in using readRDS
-fit2 <- readRDS("fit.RDS")
-

But if your model object is large, then $save_object() -could take a long time. $save_object() -reads the CmdStan results files into memory, stores them in the model -object, and saves the object with saveRDS(). To speed up -the process, you can emulate $save_object() -and replace saveRDS with the much faster -qsave() function from the qs package.

-
# Load CmdStan output files into the fitted model object.
-fit$draws() # Load posterior draws into the object.
-try(fit$sampler_diagnostics(), silent = TRUE) # Load sampler diagnostics.
-try(fit$init(), silent = TRUE) # Load user-defined initial values.
-try(fit$profiles(), silent = TRUE) # Load profiling samples.
-
-# Save the object to a file.
-qs::qsave(x = fit, file = "fit.qs")
-
-# Read the object.
-fit2 <- qs::qread("fit.qs")
-

Storage is even faster if you discard results you do not need to -save. The following example saves only posterior draws and discards -sampler diagnostics, user-specified initial values, and profiling -data.

-
# Load posterior draws into the fitted model object and omit other output.
-fit$draws()
-
-# Save the object to a file.
-qs::qsave(x = fit, file = "fit.qs")
-
-# Read the object.
-fit2 <- qs::qread("fit.qs")
-

See the vignette How -does CmdStanR work? for more information about the composition -of CmdStanR objects.

-
-
-

Comparison with RStan

-
-
-

Additional resources

-

There are additional vignettes available that discuss other aspects -of using CmdStanR. These can be found online at the CmdStanR -website:

- -

To ask a question please post on the Stan forums:

- -

To report a bug, suggest a feature (including additions to these -vignettes), or to start contributing to CmdStanR development (new -contributors welcome!) please open an issue on GitHub:

- -
- - - - - - - - - - - diff --git a/doc/posterior.R b/doc/posterior.R deleted file mode 100644 index a990a31c..00000000 --- a/doc/posterior.R +++ /dev/null @@ -1,77 +0,0 @@ -params <- -list(EVAL = FALSE) - -## ----settings-knitr, include=FALSE-------------------------------------------- -stopifnot(require(knitr)) -opts_chunk$set( - # collapse = TRUE, - dev = "png", - dpi = 150, - fig.asp = 0.618, - fig.width = 5, - out.width = "60%", - fig.align = "center", - comment = NA, - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) - -## ----include=FALSE------------------------------------------------------------ -# # Needed temporarily to avoiding weird rendering of posterior's tibbles -# # in pkgdown sites -# print.tbl_df <- function(x, ...) { -# print.data.frame(x) -# } - -## ----------------------------------------------------------------------------- -# fit <- cmdstanr::cmdstanr_example("schools", method = "sample") -# fit$summary() - -## ----------------------------------------------------------------------------- -# posterior::default_summary_measures() - -## ----------------------------------------------------------------------------- -# fit$summary(variables = c("mu", "tau")) - -## ----------------------------------------------------------------------------- -# fit$summary(variables = c("mu", "tau"), mean, sd) - -## ----------------------------------------------------------------------------- -# fit$metadata()$model_params -# fit$summary(variables = NULL, "mean", "median") - -## ----------------------------------------------------------------------------- -# my_sd <- function(x) c(My_SD = sd(x)) -# fit$summary( -# c("mu", "tau"), -# MEAN = mean, -# "median", -# my_sd, -# ~quantile(.x, probs = c(0.1, 0.9)), -# Minimum = function(x) min(x) -# ) - -## ----------------------------------------------------------------------------- -# fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975))) - -## ----------------------------------------------------------------------------- -# fit$summary(variables = NULL, dim, colMeans) - -## ----------------------------------------------------------------------------- -# fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x))) - -## ----------------------------------------------------------------------------- -# strict_pos <- function(x) if (all(x > 0)) "yes" else "no" -# fit$summary(variables = NULL, "Strictly Positive" = strict_pos) -# # fit$print(variables = NULL, "Strictly Positive" = strict_pos) - -## ----draws, message=FALSE----------------------------------------------------- -# # default is a 3-D draws_array object from the posterior package -# # iterations x chains x variables -# draws_arr <- fit$draws() # or format="array" -# str(draws_arr) -# -# # draws x variables data frame -# draws_df <- fit$draws(format = "df") -# str(draws_df) -# print(draws_df) - diff --git a/doc/posterior.Rmd b/doc/posterior.Rmd deleted file mode 100644 index 15da8e68..00000000 --- a/doc/posterior.Rmd +++ /dev/null @@ -1,126 +0,0 @@ ---- -title: "Working with Posteriors" -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 3 -params: - EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") -vignette: > - %\VignetteIndexEntry{Working with Posteriors} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r child="children/_settings-knitr.Rmd"} -``` - -```{r include=FALSE} -# Needed temporarily to avoiding weird rendering of posterior's tibbles -# in pkgdown sites -print.tbl_df <- function(x, ...) { - print.data.frame(x) -} -``` - -## Summary statistics - -We can easily customize the summary statistics reported by `$summary()` and `$print()`. - -```{r} -fit <- cmdstanr::cmdstanr_example("schools", method = "sample") -fit$summary() -``` - -By default all variables are summaries with the follow functions: -```{r} -posterior::default_summary_measures() -``` - -To change the variables summarized, we use the variables argument -```{r} -fit$summary(variables = c("mu", "tau")) -``` - -We can additionally change which functions are used -```{r} -fit$summary(variables = c("mu", "tau"), mean, sd) -``` - -To summarize all variables with non-default functions, it is necessary to set explicitly set the variables argument, either to `NULL` or the full vector of variable names. -```{r} -fit$metadata()$model_params -fit$summary(variables = NULL, "mean", "median") -``` - -Summary functions can be specified by character string, function, or using a -formula (or anything else supported by `rlang::as_function()`). If these -arguments are named, those names will be used in the tibble output. If the -summary results are named they will take precedence. -```{r} -my_sd <- function(x) c(My_SD = sd(x)) -fit$summary( - c("mu", "tau"), - MEAN = mean, - "median", - my_sd, - ~quantile(.x, probs = c(0.1, 0.9)), - Minimum = function(x) min(x) -) -``` - -Arguments to all summary functions can also be specified with `.args`. -```{r} -fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975))) -``` - -The summary functions are applied to the array of sample values, with dimension `iter_sampling`x`chains`. -```{r} -fit$summary(variables = NULL, dim, colMeans) -``` - - -For this reason users may have unexpected results if they use `stats::var()` directly, as it will return a covariance matrix. An alternative is the `distributional::variance()` function, -which can also be accessed via `posterior::variance()`. -```{r} -fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x))) -``` - -Summary functions need not be numeric, but these won't work with `$print()`. - -```{r} -strict_pos <- function(x) if (all(x > 0)) "yes" else "no" -fit$summary(variables = NULL, "Strictly Positive" = strict_pos) -# fit$print(variables = NULL, "Strictly Positive" = strict_pos) -``` - -For more information, see `posterior::summarise_draws()`, which is called by `$summary()`. - - -## Extracting posterior draws/samples - -The [`$draws()`](https://mc-stan.org/cmdstanr/reference/fit-method-draws.html) -method can be used to extract the posterior draws in formats provided by the -[**posterior**](https://mc-stan.org/posterior/) package. Here we demonstrate -only the `draws_array` and `draws_df` formats, but the **posterior** package -supports other useful formats as well. - -```{r draws, message=FALSE} -# default is a 3-D draws_array object from the posterior package -# iterations x chains x variables -draws_arr <- fit$draws() # or format="array" -str(draws_arr) - -# draws x variables data frame -draws_df <- fit$draws(format = "df") -str(draws_df) -print(draws_df) -``` - -To convert an existing draws object to a different format use the -`posterior::as_draws_*()` functions. - -To manipulate the `draws` objects use the various methods described in the -posterior package [vignettes](https://mc-stan.org/posterior/articles/index.html) -and [documentation](https://mc-stan.org/posterior/reference/index.html). - diff --git a/doc/posterior.html b/doc/posterior.html deleted file mode 100644 index 5575a9bc..00000000 --- a/doc/posterior.html +++ /dev/null @@ -1,442 +0,0 @@ - - - - - - - - - - - - - - -Working with Posteriors - - - - - - - - - - - - - - - - - - - - - - - - - - -

Working with Posteriors

- - - - -
-

Summary statistics

-

We can easily customize the summary statistics reported by -$summary() and $print().

-
fit <- cmdstanr::cmdstanr_example("schools", method = "sample")
-fit$summary()
-

By default all variables are summaries with the follow functions:

-
posterior::default_summary_measures()
-

To change the variables summarized, we use the variables argument

-
fit$summary(variables = c("mu", "tau"))
-

We can additionally change which functions are used

-
fit$summary(variables = c("mu", "tau"), mean, sd)
-

To summarize all variables with non-default functions, it is -necessary to set explicitly set the variables argument, either to -NULL or the full vector of variable names.

-
fit$metadata()$model_params
-fit$summary(variables = NULL, "mean", "median")
-

Summary functions can be specified by character string, function, or -using a formula (or anything else supported by -rlang::as_function()). If these arguments are named, those -names will be used in the tibble output. If the summary results are -named they will take precedence.

-
my_sd <- function(x) c(My_SD = sd(x))
-fit$summary(
-  c("mu", "tau"), 
-  MEAN = mean, 
-  "median",
-  my_sd,
-  ~quantile(.x, probs = c(0.1, 0.9)),
-  Minimum = function(x) min(x)
-)        
-

Arguments to all summary functions can also be specified with -.args.

-
fit$summary(c("mu", "tau"), quantile, .args = list(probs = c(0.025, .05, .95, .975)))
-

The summary functions are applied to the array of sample values, with -dimension iter_samplingxchains.

-
fit$summary(variables = NULL, dim, colMeans)
-

For this reason users may have unexpected results if they use -stats::var() directly, as it will return a covariance -matrix. An alternative is the distributional::variance() -function, which can also be accessed via -posterior::variance().

-
fit$summary(c("mu", "tau"), posterior::variance, ~var(as.vector(.x)))
-

Summary functions need not be numeric, but these won’t work with -$print().

-
strict_pos <- function(x) if (all(x > 0)) "yes" else "no"
-fit$summary(variables = NULL, "Strictly Positive" = strict_pos)
-# fit$print(variables = NULL, "Strictly Positive" = strict_pos)
-

For more information, see posterior::summarise_draws(), -which is called by $summary().

-
-
-

Extracting posterior draws/samples

-

The $draws() -method can be used to extract the posterior draws in formats provided by -the posterior -package. Here we demonstrate only the draws_array and -draws_df formats, but the posterior -package supports other useful formats as well.

-
# default is a 3-D draws_array object from the posterior package
-# iterations x chains x variables
-draws_arr <- fit$draws() # or format="array"
-str(draws_arr)
-
-# draws x variables data frame
-draws_df <- fit$draws(format = "df")
-str(draws_df)
-print(draws_df)
-

To convert an existing draws object to a different format use the -posterior::as_draws_*() functions.

-

To manipulate the draws objects use the various methods -described in the posterior package vignettes -and documentation.

-
- - - - - - - - - - - diff --git a/doc/profiling.R b/doc/profiling.R deleted file mode 100644 index 5a411341..00000000 --- a/doc/profiling.R +++ /dev/null @@ -1,106 +0,0 @@ -params <- -list(EVAL = FALSE) - -## ----settings-knitr, include=FALSE-------------------------------------------- -stopifnot(require(knitr)) -opts_chunk$set( - # collapse = TRUE, - dev = "png", - dpi = 150, - fig.asp = 0.618, - fig.width = 5, - out.width = "60%", - fig.align = "center", - comment = NA, - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) - -## ----library, message=FALSE--------------------------------------------------- -# library(cmdstanr) -# check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) - -## ----profiling_bernoulli_logit.stan------------------------------------------- -# profiling_bernoulli_logit <- write_stan_file(' -# data { -# int k; -# int n; -# matrix[n, k] X; -# array[n] int y; -# } -# parameters { -# vector[k] beta; -# real alpha; -# } -# model { -# profile("priors") { -# target += std_normal_lpdf(beta); -# target += std_normal_lpdf(alpha); -# } -# profile("likelihood") { -# target += bernoulli_logit_lpmf(y | X * beta + alpha); -# } -# } -# ') - -## ----fit-model, message=FALSE, results='hide'--------------------------------- -# # Compile the model -# model <- cmdstan_model(profiling_bernoulli_logit) -# -# # Generate some fake data -# n <- 1000 -# k <- 20 -# X <- matrix(rnorm(n * k), ncol = k) -# -# y <- 3 * X[,1] - 2 * X[,2] + 1 -# p <- runif(n) -# y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0) -# stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X) -# -# # Run one chain of the model -# fit <- model$sample(data = stan_data, chains = 1) - -## ----profiles----------------------------------------------------------------- -# fit$profiles() - -## ----profiling_bernoulli_logit_glm.stan--------------------------------------- -# profiling_bernoulli_logit_glm <- write_stan_file(' -# data { -# int k; -# int n; -# matrix[n, k] X; -# array[n] int y; -# } -# parameters { -# vector[k] beta; -# real alpha; -# } -# model { -# profile("priors") { -# target += std_normal_lpdf(beta); -# target += std_normal_lpdf(alpha); -# } -# profile("likelihood") { -# target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); -# } -# } -# ') - -## ----fit-model-glm, message=FALSE, results='hide'----------------------------- -# model_glm <- cmdstan_model(profiling_bernoulli_logit_glm) -# fit_glm <- model_glm$sample(data = stan_data, chains = 1) - -## ----profiles-glm------------------------------------------------------------- -# fit_glm$profiles() - -## ----per-gradient------------------------------------------------------------- -# profile_chain_1 <- fit$profiles()[[1]] -# per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls -# print(per_gradient_timing) # two elements for the two profile statements in the model - -## ----profile_files------------------------------------------------------------ -# fit$profile_files() - -## ----save_profile_files, eval=FALSE------------------------------------------- -# # see ?save_profile_files for info on optional arguments -# fit$save_profile_files(dir = "path/to/directory") - diff --git a/doc/profiling.Rmd b/doc/profiling.Rmd deleted file mode 100644 index 1f9eb0dc..00000000 --- a/doc/profiling.Rmd +++ /dev/null @@ -1,233 +0,0 @@ ---- -title: "Profiling Stan programs with CmdStanR" -author: "Rok Češnovar, Jonah Gabry and Ben Bales" -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 4 -params: - EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") -vignette: > - %\VignetteIndexEntry{Profiling Stan programs with CmdStanR} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r child="children/_settings-knitr.Rmd"} -``` - -## Introduction - -This vignette demonstrates how to use the new profiling functionality -introduced in CmdStan 2.26.0. - -Profiling identifies which parts of a Stan program are taking the longest time -to run and is therefore a useful guide when working on optimizing the -performance of a model. - -However, be aware that the statistical assumptions that go into a model are -the most important factors in overall model performance. It is often not -possible to make up for model problems with just brute force computation. For -ideas on how to address performance of your model from a statistical -perspective, see Gelman (2020). - -```{r library, message=FALSE} -library(cmdstanr) -check_cmdstan_toolchain(fix = TRUE, quiet = TRUE) -``` - -## Adding profiling statements to a Stan program - -Consider a simple logistic regression with parameters `alpha` and `beta`, -covariates `X`, and outcome `y`. - -``` -data { - int k; - int n; - matrix[n, k] X; - array[n] int y; -} -parameters { - vector[k] beta; - real alpha; -} -model { - beta ~ std_normal(); - alpha ~ std_normal(); - - y ~ bernoulli_logit(X * beta + alpha); -} -``` - -A simple question is how much time do the prior calculations take compared -against the likelihood? To answer this we surround the prior and likelihood -calculations with `profile` statements. - -``` -profile("priors") { - target += std_normal_lpdf(beta); - target += std_normal_lpdf(alpha); -} -profile("likelihood") { - target += bernoulli_logit_lpmf(y | X * beta + alpha); -} -``` - -In general we recommend using a separate `.stan` file, but for convenience in -this vignette we'll write the Stan program as a string and use -`write_stan_file()` to write it to a temporary file. - -```{r profiling_bernoulli_logit.stan} -profiling_bernoulli_logit <- write_stan_file(' -data { - int k; - int n; - matrix[n, k] X; - array[n] int y; -} -parameters { - vector[k] beta; - real alpha; -} -model { - profile("priors") { - target += std_normal_lpdf(beta); - target += std_normal_lpdf(alpha); - } - profile("likelihood") { - target += bernoulli_logit_lpmf(y | X * beta + alpha); - } -} -') -``` - -We can then run the model as usual and Stan will collect the profiling -information for any sections with `profile` statements. - -```{r fit-model, message=FALSE, results='hide'} -# Compile the model -model <- cmdstan_model(profiling_bernoulli_logit) - -# Generate some fake data -n <- 1000 -k <- 20 -X <- matrix(rnorm(n * k), ncol = k) - -y <- 3 * X[,1] - 2 * X[,2] + 1 -p <- runif(n) -y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0) -stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X) - -# Run one chain of the model -fit <- model$sample(data = stan_data, chains = 1) -``` - -## Accessing the profiling information from R - -The raw profiling information can then be accessed with the `$profiles()` -method, which returns a list containing one data frame per chain (profiles -across multiple chains are not automatically aggregated). Details on the column -names are available in the -[CmdStan documentation](https://mc-stan.org/docs/2_26/cmdstan-guide/stan-csv.html#profiling-csv-output-file). - -```{r profiles} -fit$profiles() -``` - -The `total_time` column is the total time spent inside a given profile -statement. It is clear that the vast majority of time is spent in the likelihood -function. - -## Comparing to a faster version of the model - -Stan's specialized glm functions can be used to make models like this faster. In -this case the likelihood can be replaced with - -``` -target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); -``` - -We'll keep the same `profile()` statements so that the profiling information for -the new model is collected automatically just like for the previous one. - -```{r profiling_bernoulli_logit_glm.stan} -profiling_bernoulli_logit_glm <- write_stan_file(' -data { - int k; - int n; - matrix[n, k] X; - array[n] int y; -} -parameters { - vector[k] beta; - real alpha; -} -model { - profile("priors") { - target += std_normal_lpdf(beta); - target += std_normal_lpdf(alpha); - } - profile("likelihood") { - target += bernoulli_logit_glm_lpmf(y | X, alpha, beta); - } -} -') -``` - -```{r fit-model-glm, message=FALSE, results='hide'} -model_glm <- cmdstan_model(profiling_bernoulli_logit_glm) -fit_glm <- model_glm$sample(data = stan_data, chains = 1) -``` - -```{r profiles-glm} -fit_glm$profiles() -``` - -We can see from the `total_time` column that this is much faster than the -previous model. - -## Per-gradient timings, and memory usage - -The other columns of the profiling output are documented in the -[CmdStan documentation](https://mc-stan.org/docs/2_26/cmdstan-guide/stan-csv.html#profiling-csv-output-file). - -The timing numbers are broken down by forward pass and reverse pass, and the -`chain_stack` and `no_chain_stack` columns contain information about how many -autodiff variables were saved in the process of performing a calculation. - -These numbers are all totals -- times are the total times over the whole -calculation, and `chain_stack` counts are similarly the total counts of autodiff -variables used over the whole calculation. It is often convenient to have -per-gradient calculations (which will be more stable across runs with different -seeds). To compute these, use the `autodiff_calls` column. - -```{r per-gradient} -profile_chain_1 <- fit$profiles()[[1]] -per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls -print(per_gradient_timing) # two elements for the two profile statements in the model -``` - -## Accessing and saving the profile files - -After sampling (or optimization or variational inference) finishes, CmdStan stores -the profiling data in CSV files in a temporary location. -The paths of the profiling CSV files can be retrieved using `$profile_files()`. - -```{r profile_files} -fit$profile_files() -``` - -These can be saved to a more permanent location with the `$save_profile_files()` -method. - -```{r save_profile_files, eval=FALSE} -# see ?save_profile_files for info on optional arguments -fit$save_profile_files(dir = "path/to/directory") -``` - -# References - -Gelman, Andrew, Aki Vehtari, Daniel Simpson, Charles C. Margossian, Bob -Carpenter, Yuling Yao, Lauren Kennedy, Jonah Gabry, Paul-Christian Bürkner, and -Martin Modrák. 2020. "Bayesian Workflow." https://arxiv.org/abs/2011.01808. diff --git a/doc/profiling.html b/doc/profiling.html deleted file mode 100644 index 6a95413b..00000000 --- a/doc/profiling.html +++ /dev/null @@ -1,552 +0,0 @@ - - - - - - - - - - - - - - - -Profiling Stan programs with CmdStanR - - - - - - - - - - - - - - - - - - - - - - - - - - -

Profiling Stan programs with CmdStanR

-

Rok Češnovar, Jonah Gabry and Ben Bales

- - - - -
-

Introduction

-

This vignette demonstrates how to use the new profiling functionality -introduced in CmdStan 2.26.0.

-

Profiling identifies which parts of a Stan program are taking the -longest time to run and is therefore a useful guide when working on -optimizing the performance of a model.

-

However, be aware that the statistical assumptions that go into a -model are the most important factors in overall model performance. It is -often not possible to make up for model problems with just brute force -computation. For ideas on how to address performance of your model from -a statistical perspective, see Gelman (2020).

-
library(cmdstanr)
-check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)
-
-
-

Adding profiling statements to a Stan program

-

Consider a simple logistic regression with parameters -alpha and beta, covariates X, and -outcome y.

-
data {
-  int<lower=1> k;
-  int<lower=0> n;
-  matrix[n, k] X;
-  array[n] int y;
-}
-parameters {
-  vector[k] beta;
-  real alpha;
-}
-model {
-  beta ~ std_normal();
-  alpha ~ std_normal();
-
-  y ~ bernoulli_logit(X * beta + alpha);
-}
-

A simple question is how much time do the prior calculations take -compared against the likelihood? To answer this we surround the prior -and likelihood calculations with profile statements.

-
profile("priors") {
-  target += std_normal_lpdf(beta);
-  target += std_normal_lpdf(alpha);
-}
-profile("likelihood") {
-  target += bernoulli_logit_lpmf(y | X * beta + alpha);
-}
-

In general we recommend using a separate .stan file, but -for convenience in this vignette we’ll write the Stan program as a -string and use write_stan_file() to write it to a temporary -file.

-
profiling_bernoulli_logit <- write_stan_file('
-data {
-  int<lower=1> k;
-  int<lower=0> n;
-  matrix[n, k] X;
-  array[n] int y;
-}
-parameters {
-  vector[k] beta;
-  real alpha;
-}
-model {
-  profile("priors") {
-    target += std_normal_lpdf(beta);
-    target += std_normal_lpdf(alpha);
-  }
-  profile("likelihood") {
-    target += bernoulli_logit_lpmf(y | X * beta + alpha);
-  }
-}
-')
-

We can then run the model as usual and Stan will collect the -profiling information for any sections with profile -statements.

-
# Compile the model
-model <- cmdstan_model(profiling_bernoulli_logit)
-
-# Generate some fake data
-n <- 1000
-k <- 20
-X <- matrix(rnorm(n * k), ncol = k)
-
-y <- 3 * X[,1] - 2 * X[,2] + 1
-p <- runif(n)
-y <- ifelse(p < (1 / (1 + exp(-y))), 1, 0)
-stan_data <- list(k = ncol(X), n = nrow(X), y = y, X = X)
-
-# Run one chain of the model
-fit <- model$sample(data = stan_data, chains = 1)
-
-
-

Accessing the profiling information from R

-

The raw profiling information can then be accessed with the -$profiles() method, which returns a list containing one -data frame per chain (profiles across multiple chains are not -automatically aggregated). Details on the column names are available in -the CmdStan -documentation.

-
fit$profiles()
-

The total_time column is the total time spent inside a -given profile statement. It is clear that the vast majority of time is -spent in the likelihood function.

-
-
-

Comparing to a faster version of the model

-

Stan’s specialized glm functions can be used to make models like this -faster. In this case the likelihood can be replaced with

-
target += bernoulli_logit_glm_lpmf(y | X, alpha, beta);
-

We’ll keep the same profile() statements so that the -profiling information for the new model is collected automatically just -like for the previous one.

-
profiling_bernoulli_logit_glm <- write_stan_file('
-data {
-  int<lower=1> k;
-  int<lower=0> n;
-  matrix[n, k] X;
-  array[n] int y;
-}
-parameters {
-  vector[k] beta;
-  real alpha;
-}
-model {
-  profile("priors") {
-    target += std_normal_lpdf(beta);
-    target += std_normal_lpdf(alpha);
-  }
-  profile("likelihood") {
-    target += bernoulli_logit_glm_lpmf(y | X, alpha, beta);
-  }
-}
-')
-
model_glm <- cmdstan_model(profiling_bernoulli_logit_glm)
-fit_glm <- model_glm$sample(data = stan_data, chains = 1)
-
fit_glm$profiles()
-

We can see from the total_time column that this is much -faster than the previous model.

-
-
-

Per-gradient timings, and memory usage

-

The other columns of the profiling output are documented in the CmdStan -documentation.

-

The timing numbers are broken down by forward pass and reverse pass, -and the chain_stack and no_chain_stack columns -contain information about how many autodiff variables were saved in the -process of performing a calculation.

-

These numbers are all totals – times are the total times over the -whole calculation, and chain_stack counts are similarly the -total counts of autodiff variables used over the whole calculation. It -is often convenient to have per-gradient calculations (which will be -more stable across runs with different seeds). To compute these, use the -autodiff_calls column.

-
profile_chain_1 <- fit$profiles()[[1]]
-per_gradient_timing <- profile_chain_1$total_time/profile_chain_1$autodiff_calls
-print(per_gradient_timing) # two elements for the two profile statements in the model
-
-
-

Accessing and saving the profile files

-

After sampling (or optimization or variational inference) finishes, -CmdStan stores the profiling data in CSV files in a temporary location. -The paths of the profiling CSV files can be retrieved using -$profile_files().

-
fit$profile_files()
-

These can be saved to a more permanent location with the -$save_profile_files() method.

-
# see ?save_profile_files for info on optional arguments
-fit$save_profile_files(dir = "path/to/directory")
-
-
-

References

-

Gelman, Andrew, Aki Vehtari, Daniel Simpson, Charles C. Margossian, -Bob Carpenter, Yuling Yao, Lauren Kennedy, Jonah Gabry, Paul-Christian -Bürkner, and Martin Modrák. 2020. “Bayesian Workflow.” https://arxiv.org/abs/2011.01808.

-
- - - - - - - - - - - diff --git a/doc/r-markdown.R b/doc/r-markdown.R deleted file mode 100644 index 77fa734c..00000000 --- a/doc/r-markdown.R +++ /dev/null @@ -1,28 +0,0 @@ -params <- -list(EVAL = FALSE) - -## ----settings-knitr, include = FALSE------------------------------------------ -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) - -## ----register-engine, message=FALSE------------------------------------------- -# library(cmdstanr) -# register_knitr_engine(override = TRUE) - -## ----print-ex1---------------------------------------------------------------- -# ex1$print() - -## ----fit-ex1------------------------------------------------------------------ -# fit <- ex1$sample( -# refresh = 0, -# seed = 42L -# ) -# -# print(fit) - -## ----register-engine-no-override---------------------------------------------- -# register_knitr_engine(override = FALSE) - diff --git a/doc/r-markdown.Rmd b/doc/r-markdown.Rmd deleted file mode 100644 index ec03831e..00000000 --- a/doc/r-markdown.Rmd +++ /dev/null @@ -1,157 +0,0 @@ ---- -title: "R Markdown CmdStan Engine" -author: "Mikhail Popov" -output: - rmarkdown::html_vignette: - toc: true -params: - EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true") -vignette: > - %\VignetteIndexEntry{R Markdown CmdStan Engine} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r settings-knitr, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - eval = if (isTRUE(exists("params"))) params$EVAL else FALSE -) -``` - -R Markdown supports a variety of languages through the use of knitr language -engines. Where users wish to write Stan programs as chunks directly in R Markdown documents there are three options: - -1. the user wishes all the Stan chunks in the R Markdown document to be processed using RStan; -2. all Stan chunks are to be processed using CmdStanR; and. -3. some chunks are to be processed by RStan and some by CmdStanR. - -Behind the scenes in each option, the engine compiles the model code in each -chunk and creates an object that provides methods to run the model: a -`stanmodel` if Rstan is being used, or a `CmdStanModel` in the CmdStanR case. -This model object is assigned to a variable with the name given by the -`output.var` chunk option. - -## Option 1: Using RStan for all chunks - -This is the default option. In that case we can write, for example: - -````{verbatim} -```{stan, output.var="model"} -// Stan model code -``` - -```{r} -rstan::sampling(model) -``` -```` - -## Option 2: Using CmdStanR for all chunks - -If CmdStanR is being used a replacement engine needs to be registered along the following lines: - -```{r register-engine, message=FALSE} -library(cmdstanr) -register_knitr_engine(override = TRUE) -``` - -This overrides knitr's built-in `stan` engine so that all `stan` -chunks are processed with CmdStanR, not RStan. Of course, this also means that -the variable specified by `output.var` will no longer be a `stanmodel` object, -but instead a `CmdStanModel` object, so the example code above would look like this: - -````{verbatim} -```{stan, output.var="model"} -// Stan model code -``` - -```{r} -model$sample() -``` -```` - -## Example - -```{stan ex1, output.var="ex1"} -// This stan chunk results in a CmdStanModel object called "ex1" -parameters { - array[2] real y; -} -model { - y[1] ~ normal(0, 1); - y[2] ~ double_exponential(0, 2); -} -``` - -```{r print-ex1} -ex1$print() -``` - -```{r fit-ex1} -fit <- ex1$sample( - refresh = 0, - seed = 42L -) - -print(fit) -``` - - -## Option 3: Using both RStan and CmdStanR in the same R Markdown document - -While the default behavior is to override the built-in `stan` engine because the -assumption is that the user is probably not using both RStan and CmdStanR in the -same document or project, the option to use both exists. When registering -CmdStanR's knitr engine, set `override = FALSE` to register the engine as a -`cmdstan` engine: - -```{r register-engine-no-override} -register_knitr_engine(override = FALSE) -``` - -This will cause `stan` chunks to be processed by knitr's built-in, RStan-based -engine and only use CmdStanR's knitr engine for `cmdstan` chunks: - -````{verbatim} -```{stan, output.var="model_obj1"} -// Results in a stanmodel object from RStan -``` - -```{r} -rstan::sampling(model_obj1) -``` - -```{cmdstan, output.var="model_obj2"} -// Results in a CmdStanModel object from CmdStanR -``` - -```{r} -model_obj2$sample() -``` -```` - - -## Caching chunks - -Use `cache=TRUE` chunk option to avoid re-compiling the Stan model code every -time the R Markdown is knit/rendered. - -You can find the Stan model file and the compiled executable in the document's -cache directory. - - - -## Running interactively - -When running chunks interactively in RStudio (e.g. when using -[R Notebooks](https://bookdown.org/yihui/rmarkdown/notebook.html)), it has been -observed that the built-in, RStan-based engine is used for `stan` chunks even -when CmdStanR's engine has been registered in the session as the engine for -`stan`. As a workaround, when running chunks *interactively*, it is recommended -to use the `override = FALSE` option and change `stan` chunks to be `cmdstan` -chunks. - -Do not worry: if the template you use supports syntax highlighting for the Stan -language, that syntax highlighting will be applied to `cmdstan` chunks when the -document is knit/rendered. diff --git a/doc/r-markdown.html b/doc/r-markdown.html deleted file mode 100644 index b0444ef3..00000000 --- a/doc/r-markdown.html +++ /dev/null @@ -1,493 +0,0 @@ - - - - - - - - - - - - - - - -R Markdown CmdStan Engine - - - - - - - - - - - - - - - - - - - - - - - - - - -

R Markdown CmdStan Engine

-

Mikhail Popov

- - - - -

R Markdown supports a variety of languages through the use of knitr -language engines. Where users wish to write Stan programs as chunks -directly in R Markdown documents there are three options:

-
    -
  1. the user wishes all the Stan chunks in the R Markdown document to be -processed using RStan;
    -
  2. -
  3. all Stan chunks are to be processed using CmdStanR; and.
    -
  4. -
  5. some chunks are to be processed by RStan and some by CmdStanR.
  6. -
-

Behind the scenes in each option, the engine compiles the model code -in each chunk and creates an object that provides methods to run the -model: a stanmodel if Rstan is being used, or a -CmdStanModel in the CmdStanR case. This model object is -assigned to a variable with the name given by the -output.var chunk option.

-
-

Option 1: Using RStan for all chunks

-

This is the default option. In that case we can write, for -example:

-
```{stan, output.var="model"}
-// Stan model code
-```
-
-```{r}
-rstan::sampling(model)
-```
-
-
-

Option 2: Using CmdStanR for all chunks

-

If CmdStanR is being used a replacement engine needs to be registered -along the following lines:

-
library(cmdstanr)
-register_knitr_engine(override = TRUE)
-

This overrides knitr’s built-in stan engine so that all -stan chunks are processed with CmdStanR, not RStan. Of -course, this also means that the variable specified by -output.var will no longer be a stanmodel -object, but instead a CmdStanModel object, so the example -code above would look like this:

-
```{stan, output.var="model"}
-// Stan model code
-```
-
-```{r}
-model$sample()
-```
-
-
-

Example

-
// This stan chunk results in a CmdStanModel object called "ex1"
-parameters {
-  array[2] real y;
-}
-model {
-  y[1] ~ normal(0, 1);
-  y[2] ~ double_exponential(0, 2);
-}
-
ex1$print()
-
fit <- ex1$sample(
-  refresh = 0,
-  seed = 42L
-)
-
-print(fit)
-
-
-

Option 3: Using both RStan and CmdStanR in the same R Markdown -document

-

While the default behavior is to override the built-in -stan engine because the assumption is that the user is -probably not using both RStan and CmdStanR in the same document or -project, the option to use both exists. When registering CmdStanR’s -knitr engine, set override = FALSE to register the engine -as a cmdstan engine:

-
register_knitr_engine(override = FALSE)
-

This will cause stan chunks to be processed by knitr’s -built-in, RStan-based engine and only use CmdStanR’s knitr engine for -cmdstan chunks:

-
```{stan, output.var="model_obj1"}
-// Results in a stanmodel object from RStan
-```
-
-```{r}
-rstan::sampling(model_obj1)
-```
-
-```{cmdstan, output.var="model_obj2"}
-// Results in a CmdStanModel object from CmdStanR
-```
-
-```{r}
-model_obj2$sample()
-```
-
-
-

Caching chunks

-

Use cache=TRUE chunk option to avoid re-compiling the -Stan model code every time the R Markdown is knit/rendered.

-

You can find the Stan model file and the compiled executable in the -document’s cache directory.

-
-
-

Running interactively

-

When running chunks interactively in RStudio (e.g. when using R -Notebooks), it has been observed that the built-in, RStan-based -engine is used for stan chunks even when CmdStanR’s engine -has been registered in the session as the engine for stan. -As a workaround, when running chunks interactively, it is -recommended to use the override = FALSE option and change -stan chunks to be cmdstan chunks.

-

Do not worry: if the template you use supports syntax highlighting -for the Stan language, that syntax highlighting will be applied to -cmdstan chunks when the document is knit/rendered.

-
- - - - - - - - - - -