diff --git a/cmdstanpy/cmdstan_args.py b/cmdstanpy/cmdstan_args.py index 0054fdce..a586f02c 100644 --- a/cmdstanpy/cmdstan_args.py +++ b/cmdstanpy/cmdstan_args.py @@ -31,6 +31,7 @@ class Method(Enum): VARIATIONAL = auto() LAPLACE = auto() PATHFINDER = auto() + DIAGNOSE = auto() def __repr__(self) -> str: return '<%s.%s>' % (self.__class__.__name__, self.name) @@ -736,6 +737,46 @@ def compose(self, idx: int, cmd: List[str]) -> List[str]: return cmd +class DiagnoseArgs: + """Arguments needed for diagnostics method.""" + + def __init__( + self, + test: Optional[str] = None, + epsilon: Optional[float] = None, + error: Optional[float] = None, + ) -> None: + self.test = test + self.epsilon = epsilon + self.error = error + + def validate( + self, chains: Optional[int] = None # pylint: disable=unused-argument + ) -> None: + """ + Check argument correctness and consistency. + """ + if self.test is not None and self.test != "gradient": + raise ValueError("Only testing gradient is supported.") + + positive_float(self.epsilon, 'epsilon') + positive_float(self.error, 'error') + + # pylint: disable=unused-argument + def compose(self, idx: int, cmd: List[str]) -> List[str]: + """ + Compose CmdStan command for method-specific non-default arguments. + """ + cmd.append('method=diagnose') + # 'gradient' is the only test validate() accepts; epsilon and error + # are its sub-arguments, so emit test=gradient exactly once. + cmd.append('test=gradient') + if self.epsilon is not None: + cmd.append(f'epsilon={self.epsilon}') + if self.error is not None: + cmd.append(f'error={self.error}') + return cmd + + class CmdStanArgs: """ Container for CmdStan command line arguments. 
@@ -755,6 +797,7 @@ def __init__( VariationalArgs, LaplaceArgs, PathfinderArgs, + DiagnoseArgs, ], data: Union[Mapping[str, Any], str, None] = None, seed: Union[int, List[int], None] = None, @@ -790,6 +833,8 @@ def __init__( self.method = Method.LAPLACE elif isinstance(method_args, PathfinderArgs): self.method = Method.PATHFINDER + elif isinstance(method_args, DiagnoseArgs): + self.method = Method.DIAGNOSE else: raise ValueError( 'Unsupported method args type: {}'.format(type(method_args)) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 17be3b5e..d1287aef 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -38,6 +38,7 @@ ) from cmdstanpy.cmdstan_args import ( CmdStanArgs, + DiagnoseArgs, GenerateQuantitiesArgs, LaplaceArgs, Method, @@ -53,6 +54,7 @@ CmdStanMLE, CmdStanPathfinder, CmdStanVB, + CmdStanDiagnose, RunSet, from_csv, ) @@ -2203,3 +2205,117 @@ def progress_hook(line: str, idx: int) -> None: pbars[idx].postfix[0]["value"] = mline return progress_hook + + def diagnose( + self, + data: Union[Mapping[str, Any], str, os.PathLike, None] = None, + seed: Optional[int] = None, + inits: Union[Dict[str, float], float, str, None] = None, + output_dir: OptionalPath = None, + sig_figs: Optional[int] = None, + show_console: bool = False, + time_fmt: str = "%Y%m%d%H%M%S", + timeout: Optional[float] = None, + epsilon: Optional[float] = None, + error: Optional[float] = None, + require_gradients_ok: bool = True, + ) -> CmdStanDiagnose: + """ + Run diagnostics to calculate the gradients of the initial state and + compare them with gradients calculated by finite differences. + Discrepancies between the two indicate that there is a problem with the + model or initial states or else there is a bug in Stan. + + :param data: Values for all data variables in the model, specified + either as a dictionary with entries matching the data variables, + or as the path of a data file in JSON or Rdump format. + + :param seed: The seed for random number generator. 
Must be an integer + between 0 and 2^32 - 1. If unspecified, + :func:`numpy.random.default_rng` is used to generate a seed. + + :param inits: Specifies how the sampler initializes parameter values. + Initialization is either uniform random on a range centered on 0, + exactly 0, or a dictionary or file of initial values for some or + all parameters in the model. The default initialization behavior + will initialize all parameter values on range [-2, 2] on the + *unconstrained* support. The following value types are allowed: + + * Single number, n > 0 - initialization range is [-n, n]. + * 0 - all parameters are initialized to 0. + * dictionary - pairs parameter name : initial value. + * string - pathname to a JSON or Rdump data file. + + :param output_dir: Name of the directory to which CmdStan output + files are written. If unspecified, output files will be written + to a temporary directory which is deleted upon session exit. + + :param sig_figs: Numerical precision used for output CSV and text files. + Must be an integer between 1 and 18. If unspecified, the default + precision for the system file I/O is used; the usual value is 6. + Introduced in CmdStan-2.25. + + :param show_console: If ``True``, stream CmdStan messages sent to stdout + and stderr to the console. Default is ``False``. + + :param time_fmt: A format string passed to + :meth:`~datetime.datetime.strftime` to decide the file names for + output CSVs. Defaults to "%Y%m%d%H%M%S" + + :param timeout: Duration at which the diagnostic command times out in + seconds. Defaults to None. + + :param epsilon: Step size for finite difference gradients. + + :param error: Absolute error threshold for comparing autodiff and finite + difference gradients. + + :param require_gradients_ok: Whether or not to raise an error if Stan + reports that the difference between autodiff gradients and finite + difference gradients exceed the error threshold. + + :return: A :class:`CmdStanDiagnose` object. 
+        """
+        diagnose_args = DiagnoseArgs(
+            epsilon=epsilon,
+            error=error,
+        )
+
+        with temp_single_json(data) as _data, temp_inits(inits) as _inits:
+            args = CmdStanArgs(
+                model_name=self._name,
+                model_exe=self._exe_file,
+                chain_ids=None,
+                method_args=diagnose_args,
+                data=_data,
+                seed=seed,
+                inits=_inits,
+                output_dir=output_dir,
+                sig_figs=sig_figs,
+            )
+
+            dummy_chain_id = 0
+            runset = RunSet(args=args, chains=1, time_fmt=time_fmt)
+            self._run_cmdstan(
+                runset,
+                dummy_chain_id,
+                show_console=show_console,
+                timeout=timeout,
+            )
+        runset.raise_for_timeouts()
+
+        if not runset._check_retcodes():
+            if require_gradients_ok:
+                raise RuntimeError(
+                    "The difference between autodiff and finite difference "
+                    "gradients may exceed the error threshold. If you would "
+                    "like to inspect the output, re-call with "
+                    "`require_gradients_ok=False`."
+                )
+            get_logger().warning(
+                "The difference between autodiff and finite difference "
+                "gradients may exceed the error threshold. Proceeding because "
+                "`require_gradients_ok` is set to `False`."
+            )
+
+        return CmdStanDiagnose(runset) diff --git a/cmdstanpy/stanfit/__init__.py b/cmdstanpy/stanfit/__init__.py index 50764a30..baf02704 100644 --- a/cmdstanpy/stanfit/__init__.py +++ b/cmdstanpy/stanfit/__init__.py @@ -22,6 +22,7 @@ from .pathfinder import CmdStanPathfinder from .runset import RunSet from .vb import CmdStanVB +from .diagnose import CmdStanDiagnose __all__ = [ "RunSet", @@ -32,6 +33,7 @@ "CmdStanGQ", "CmdStanLaplace", "CmdStanPathfinder", + "CmdStanDiagnose", ] diff --git a/cmdstanpy/stanfit/diagnose.py b/cmdstanpy/stanfit/diagnose.py new file mode 100644 index 00000000..cbd0f552 --- /dev/null +++ b/cmdstanpy/stanfit/diagnose.py @@ -0,0 +1,87 @@ +import io +import re +from typing import Optional + +import pandas as pd + +from ..cmdstan_args import Method +from ..utils import scan_config + +from .runset import RunSet + + +class CmdStanDiagnose: + """ + Container for outputs from CmdStan diagnostics. 
+    Created by + :meth:`CmdStanModel.diagnose`. + """ + + def __init__(self, runset: RunSet) -> None: + if runset.method != Method.DIAGNOSE: + raise ValueError( + "Wrong runset method, expecting diagnose runset, found method " + f"{runset.method}." + ) + self.runset = runset + self.gradients_ok = runset._check_retcodes() + + # Split the csv into header and gradient table parts. + with open(self.runset.csv_files[0]) as fp: + text = fp.read() + header, table = re.split(r"#\s+Log probability=.*", text, maxsplit=1) + self.config = {} + scan_config(io.StringIO(header), self.config, 0) + + # Remove comment characters, leading whitespace, and empty lines. + lines = [] + for line in table.splitlines(): + line = re.sub(r"^#\s+", "", line) + if not line: + continue + # If this is the first line, remove whitespace from column names. + if not lines: + line = ( + line + .replace("param idx", "param_idx") + .replace("finite diff", "finite_diff") + ) + lines.append(line) + self._gradients = pd.read_csv(io.StringIO("\n".join(lines)), sep=r"\s+") + + def __repr__(self) -> str: + cmd = self.runset._args.method_args.compose(0, cmd=[]) + lines = [ + f"CmdStanDiagnose: model={self.runset.model}{cmd}", + f"\tcsv_file: {self.runset.csv_files[0]}", + f"\toutput_file: {self.runset.stdout_files[0]}", + ] + if not self.gradients_ok: + lines.append( + "Warning: autodiff and finite difference gradients may not " + "agree." + ) + return "\n".join(lines) + + @property + def gradients(self) -> pd.DataFrame: + """ + Dataframe of parameter names, autodiff gradients, finite difference + gradients and the delta between the two gradient estimates. + """ + return self._gradients + + def save_csvfiles(self, dir: Optional[str] = None) -> None: + """ + Move output CSV files to specified directory. If files were + written to the temporary session directory, clean filename. + E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as + 'bernoulli-201912081451-1.csv'. 
+ + :param dir: directory path + + See Also + -------- + stanfit.RunSet.save_csvfiles + cmdstanpy.from_csv + """ + self.runset.save_csvfiles(dir) diff --git a/test/test_model.py b/test/test_model.py index 55c2bfe7..b01ab951 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -3,6 +3,7 @@ import contextlib import io import logging +import numpy as np import os import re import shutil @@ -593,3 +594,32 @@ def test_format_old_version() -> None: model.format(max_line_length=88) model.format(canonicalize=True) + + +def test_diagnose(): + # Check the gradients. + model = CmdStanModel(stan_file=BERN_STAN) + result = model.diagnose(data=BERN_DATA) + + # Check we have the right columns. + assert set(result.gradients) == { + "param_idx", + "value", + "model", + "finite_diff", + "error", + } + assert result.gradients_ok + + # Simulate bad gradients by using large finite difference. + with pytest.raises(RuntimeError, match="may exceed the error threshold"): + model.diagnose(data=BERN_DATA, epsilon=3) + + # Check we get the results if we set require_gradients_ok=False. + result = model.diagnose( + data=BERN_DATA, + epsilon=3, + require_gradients_ok=False, + ) + assert np.abs(result.gradients["error"]).max() > 1e-3 + assert not result.gradients_ok