From 0e0d0d5e567356ef8cb77370cd72e61005dca8c0 Mon Sep 17 00:00:00 2001
From: Matt Epland
Date: Sun, 26 May 2024 21:21:48 -0400
Subject: [PATCH] Manually write bad points (#68)

* Create a script to manually write out a bad point

This is useful when a Bayesian optimization iteration is killed by the OS
with an uncatchable SIGKILL. The user can then easily write out the point
with BAD_TARGET and BAD_METRICS.

* Add manual_bad_point model_name for those added via write_manual_bad_point

Exclude from mean and std time minutes_elapsed

Fix bug, BayesianOptimization needs hyperparam names in bounds

* combine lines

* lint

* Add NHiTSModel bad point

* lint

* Another bad point for NHiTS

* Added section to README

* edits
---
 README.md               |  20 ++++-
 ana/manual_bad_point.py | 167 ++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |   1 +
 utils/bayesian_opt.py   |  86 ++++++++++++++++++++-
 4 files changed, 270 insertions(+), 4 deletions(-)
 create mode 100644 ana/manual_bad_point.py

diff --git a/README.md b/README.md
index 8f9b77a5..0e489dd6 100644
--- a/README.md
+++ b/README.md
@@ -236,7 +236,9 @@ both of the individual forecasting models themselves as well as how the data is
 [Bayesian optimization](https://github.com/mepland/data_science_notes)
 was used to efficiently sample the parameter space.
 The functions needed to run Bayesian optimization
-are located in [`bayesian_opt.py`](utils/bayesian_opt.py).
+are located in [`bayesian_opt.py`](utils/bayesian_opt.py),
+and make use of the
+[`bayesian-optimization`](https://github.com/bayesian-optimization/BayesianOptimization) library.
 
 Unfortunately, actually running the optimization over GPU accelerated models
 is not as simple as calling the `run_bayesian_opt()` function.
@@ -247,7 +249,7 @@ leading to out of GPU memory errors, even when
 [using commands like `gc.collect()` and `torch.cuda.empty_cache()`](https://stackoverflow.com/questions/70508960/how-to-free-gpu-memory-in-pytorch).
 The `torch` models created by `darts` are very convenient,
 but do not provide as much configurability as building your own `torch` model from scratch,
-leading me unable to fix this issue in a clean way.
+leaving me unable to fix this issue in a clean way.
 To work around the GPU memory issues,
 a shell script, [`start_bayesian_opt`](ana/start_bayesian_opt),
 is used to repeatedly call `run_bayesian_opt()`
@@ -257,12 +259,26 @@ totally clearing memory between training iterations.
 A signed pickle file is used
 to quickly load the necessary data and settings on each iteration.
 Instructions for running the whole Bayesian optimization workflow are provided below.
 
+Some hyperparameter points chosen for testing by the optimizer result in crashes during training,
+either due to memory limitations or to invalid parameter combinations that slip past pre-run checks.
+In most cases these exceptions can be caught within Python,
+allowing the point to be automatically logged with the worst possible target, `BAD_TARGET = -999.0`.
+However, the Python process itself is occasionally killed
+by the operating system with an uncatchable `SIGKILL` signal,
+likely due to a request for too much memory.
+Resuming the run will only result in repeated crashes on the same point, as it is never written to disk.
+Rather than rework the `bayesian-optimization` library to pre-register points before optimization,
+a [`manual_bad_point.py`](ana/manual_bad_point.py) script is included
+to log these few points by hand.
+
 ### Running Bayesian Optimization
 
 1. Create the input `parent_wrapper.pickle` file for `bayesian_opt_runner.py` via the `exploratory_ana.py` notebook.
 2. Configure the run in `start_bayesian_opt` and `bayesian_opt_runner.py`.
 3. Run the shell script, logging outputs to disk via:
+    * Log any bad points whose training run is killed by the operating system
+with `manual_bad_point.py`, then resume the search.
 
 ```bash
 ./ana/start_bayesian_opt 2>&1 | tee ana/models/bayesian_optimization/bayesian_opt.log
diff --git a/ana/manual_bad_point.py b/ana/manual_bad_point.py
new file mode 100644
index 00000000..45eeb097
--- /dev/null
+++ b/ana/manual_bad_point.py
@@ -0,0 +1,167 @@
+"""Standalone script to execute write_manual_bad_point."""
+
+import pathlib
+import pprint
+import sys
+from typing import Final
+
+import hydra
+from omegaconf import DictConfig  # noqa: TC002
+
+sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
+
+# pylint: disable=import-error,useless-suppression,duplicate-code
+# pylint: enable=useless-suppression
+from utils.shared_functions import read_secure_pickle
+
+# isort: off
+from utils.bayesian_opt import write_manual_bad_point
+
+# PyTorch NN Models
+# from TSModelWrappers.NBEATSModelWrapper import NBEATSModelWrapper
+from TSModelWrappers.NHiTSModelWrapper import NHiTSModelWrapper
+
+# isort: on
+# pylint: enable=import-error
+
+__all__: list[str] = []
+
+
+@hydra.main(version_base=None, config_path="..", config_name="config")
+def run_write_manual_bad_point(
+    cfg: DictConfig,
+) -> None:
+    """Run the write_manual_bad_point script.
+
+    Args:
+        cfg (DictConfig): Hydra configuration.
+    """
+    # Setup variables
+    # pylint: disable=invalid-name
+    PACKAGE_PATH: Final = pathlib.Path(cfg["general"]["package_path"]).expanduser()
+    MODELS_PATH: Final = PACKAGE_PATH / "ana" / "models"
+    BAYESIAN_OPT_WORK_DIR_NAME: Final = "bayesian_optimization"
+
+    # Load PARENT_WRAPPER from pickle
+    PARENT_WRAPPER_PATH: Final = MODELS_PATH / BAYESIAN_OPT_WORK_DIR_NAME / "parent_wrapper.pickle"
+    PARENT_WRAPPER: Final = read_secure_pickle(PARENT_WRAPPER_PATH)
+    # pylint: enable=invalid-name
+
+    if PARENT_WRAPPER is None:
+        print(f"Failed to load PARENT_WRAPPER from {PARENT_WRAPPER_PATH}!")
+        sys.exit(3)
+
+    # Manually specify bad points and model
+
+    # model_wrapper_class = NBEATSModelWrapper
+    # bad_point_to_write = {
+    #     "batch_size": 182.67288601975548,
+    #     "covariates_to_use": 4.0,
+    #     "dropout": 0.15,
+    #     "expansion_coefficient_dim": 10.0,
+    #     "input_chunk_length": 1.0,
+    #     "layer_widths": 845.7812745971257,
+    #     "num_blocks": 10.0,
+    #     "num_layers": 10.0,
+    #     "num_stacks": 50.0,
+    #     "time_bin_size_in_minutes": 20.0,
+    #     "y_presentation": 2.0,
+    # }
+    # bad_point_to_write_clean = {
+    #     "batch_size": 182,
+    #     "covariates_to_use": 4,
+    #     "dropout": 0.15,
+    #     "expansion_coefficient_dim": 10,
+    #     "input_chunk_length": 1,
+    #     "layer_widths": 845,
+    #     "num_blocks": 10,
+    #     "num_layers": 10,
+    #     "num_stacks": 50,
+    #     "time_bin_size_in_minutes": 20,
+    #     "y_presentation": 2,
+    # }
+
+    model_wrapper_class = NHiTSModelWrapper
+    # bad_point_to_write = {
+    #     "MaxPool1d": 0.0,
+    #     "batch_size": 955.0581345768601,
+    #     "covariates_to_use": 4.0,
+    #     "dropout": 0.0,
+    #     "input_chunk_length": 60.0,
+    #     "layer_widths": 719.959976362605,
+    #     "num_blocks": 10.0,
+    #     "num_layers": 10.0,
+    #     "num_stacks": 50.0,
+    #     "time_bin_size_in_minutes": 20.0,
+    #     "y_presentation": 2.0,
+    # }
+    # bad_point_to_write_clean = {
+    #     "MaxPool1d": False,
+    #     "batch_size": 955,
+    #     "covariates_to_use": 4,
+    #     "dropout": 0.0,
+    #     "input_chunk_length": 60,
+    #     "layer_widths": 719,
+    #     "num_blocks": 10,
+    #     "num_layers": 10,
+    #     "num_stacks": 50,
+    #     "time_bin_size_in_minutes": 20,
+    #     "y_presentation": 2,
+    # }
+
+    bad_point_to_write = {
+        "MaxPool1d": 0.5326385245470463,
+        "batch_size": 770.1273533676639,
+        "covariates_to_use": 0.8837601704704117,
+        "dropout": 0.012044213327474301,
+        "input_chunk_length": 50.88499202512683,
+        "layer_widths": 1010.2074180564931,
+        "num_blocks": 7.2488452757955475,
+        "num_layers": 9.777378079492287,
+        "num_stacks": 41.80114254147596,
+        "time_bin_size_in_minutes": 8.77778411907413,
+        "y_presentation": 0.21956300346363777,
+    }
+    bad_point_to_write_clean = {
+        "MaxPool1d": True,
+        "batch_size": 770,
+        "covariates_to_use": 1,
+        "dropout": 0.012044213327474301,
+        "input_chunk_length": 50,
+        "layer_widths": 1010,
+        "num_blocks": 7,
+        "num_layers": 9,
+        "num_stacks": 41,
+        "time_bin_size_in_minutes": 10,
+        "y_presentation": 0,
+    }
+
+    print(
+        f"""
+bad_point_to_write = {pprint.pformat(bad_point_to_write)}
+
+bad_point_to_write_clean = {pprint.pformat(bad_point_to_write_clean)}
+"""
+    )
+
+    _model_name = model_wrapper_class.__name__.replace("Wrapper", "")
+    response = input(
+        f"Are you sure you want to manually write the above bad point for {_model_name}? "
+    )
+    if response.lower() not in ["y", "yes"]:
+        sys.exit()
+
+    response = input("Are you REALLY sure? ")
+    if response.lower() not in ["y", "yes"]:
+        sys.exit()
+
+    write_manual_bad_point(
+        bad_point_to_write=bad_point_to_write,
+        bad_point_to_write_clean=bad_point_to_write_clean,
+        parent_wrapper=PARENT_WRAPPER,
+        model_wrapper_class=model_wrapper_class,
+    )
+
+
+if __name__ == "__main__":
+    run_write_manual_bad_point()  # pylint: disable=no-value-for-parameter
diff --git a/pyproject.toml b/pyproject.toml
index 346aa8f3..6af38aaa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -198,6 +198,7 @@ extend-select = ["B901", "B902", "B903", "B904", "B905", "B906", "B907", "B908"]
 per-file-ignores = [
     "ana/exploratory_ana.py:E402,E800,DALL000",
     "ana/drive_bayesian_opt.py:E402,CM001",
+    "ana/manual_bad_point.py:E402,E800",
 ]
 extend-exclude = [
     ".cache",
diff --git a/utils/bayesian_opt.py b/utils/bayesian_opt.py
index e7671275..bb1233a6 100644
--- a/utils/bayesian_opt.py
+++ b/utils/bayesian_opt.py
@@ -388,8 +388,12 @@ def load_best_points(
         ].index.size,
         "minutes_elapsed_total": dfp["minutes_elapsed_total"].max(),
         "minutes_elapsed_point_best": best_dict["minutes_elapsed_point"],
-        "minutes_elapsed_mean": dfp["minutes_elapsed_point"].mean(),
-        "minutes_elapsed_stddev": dfp["minutes_elapsed_point"].std(),
+        "minutes_elapsed_mean": dfp.loc[dfp["model_name"] != "manual_bad_point"][
+            "minutes_elapsed_point"
+        ].mean(),
+        "minutes_elapsed_stddev": dfp.loc[dfp["model_name"] != "manual_bad_point"][
+            "minutes_elapsed_point"
+        ].std(),
         "id_point_best": best_dict["id_point"],
         "datetime_end_best": best_dict["datetime_end"],
         "params_best": ", ".join(best_params),
@@ -1292,3 +1296,81 @@ def _build_error_msg(error_msg: str, error: Exception) -> str:
     optimizer.dispatch(Events.OPTIMIZATION_END)
 
     return optimizer.max, optimizer, exception_status
+
+
+def write_manual_bad_point(
+    *,
+    bad_point_to_write: dict,
+    bad_point_to_write_clean: dict,
+    parent_wrapper: TSModelWrapper,
+    model_wrapper_class: WrapperTypes,
+    bayesian_opt_work_dir_name: str = "bayesian_optimization",
+) -> None:
+    """Manually write a point, raw and clean, as a failed point to the JSON and CSV logs.
+
+    This is useful when an iteration is killed by the OS with an uncatchable SIGKILL.
+
+    Args:
+        bad_point_to_write (dict): Bad hyperparameter point to write, raw.
+        bad_point_to_write_clean (dict): Bad hyperparameter point to write, clean.
+        parent_wrapper (TSModelWrapper): TSModelWrapper object containing all parent configs.
+        model_wrapper_class (WrapperTypes): TSModelWrapper class to optimize.
+        bayesian_opt_work_dir_name (str): Directory name to save logs and models in, within the parent_wrapper.work_dir_base. (Default value = 'bayesian_optimization')
+    """
+    model_wrapper = model_wrapper_class(TSModelWrapper=parent_wrapper)
+    optimizer = bayes_opt.BayesianOptimization(
+        f=None, pbounds={k: (None, None) for k, v in bad_point_to_write.items()}
+    )
+
+    # Setup Logging
+    generic_model_name: Final = model_wrapper.get_generic_model_name()
+    model_type: Final = model_wrapper.get_model_type()
+    bayesian_opt_work_dir: Final = pathlib.Path(
+        model_wrapper.work_dir_base, bayesian_opt_work_dir_name, generic_model_name
+    ).expanduser()
+    fname_json_log: Final = (
+        bayesian_opt_work_dir / f"{BAYESIAN_OPT_PREFIX}{generic_model_name}.json"
+    )
+    fname_csv_log: Final = bayesian_opt_work_dir / f"{BAYESIAN_OPT_PREFIX}{generic_model_name}.csv"
+
+    # Reload prior points, must be done before json_logger is recreated to avoid duplicating past runs
+    json_logger = JSONLogger(path=str(fname_json_log), reset=False)
+    optimizer.subscribe(Events.OPTIMIZATION_STEP, json_logger)
+
+    id_point = get_point_hash(bad_point_to_write_clean)
+    model_name = "manual_bad_point"
+
+    optimizer.register(params=bad_point_to_write, target=BAD_TARGET)
+    datetime_end_str = get_datetime_str_from_json(
+        enable_json_logging=True, fname_json_log=fname_json_log
+    )
+
+    write_csv_row(
+        enable_csv_logging=True,
+        fname_csv_log=fname_csv_log,
+        datetime_start_str=datetime_end_str,
+        datetime_end_str=datetime_end_str,
+        id_point=id_point,
+        target=BAD_TARGET,
+        metrics_val=BAD_METRICS,
+        point=bad_point_to_write,
+        is_clean=False,
+        model_name=model_name,
+        model_type=model_type,
+    )
+
+    optimizer.register(params=bad_point_to_write_clean, target=BAD_TARGET)
+
+    write_csv_row(
+        enable_csv_logging=True,
+        fname_csv_log=fname_csv_log,
+        datetime_start_str=datetime_end_str,
+        datetime_end_str=datetime_end_str,
+        id_point=id_point,
+        target=BAD_TARGET,
+        metrics_val=BAD_METRICS,
+        point=bad_point_to_write_clean,
+        is_clean=True,
+        model_name=model_name,
+        model_type=model_type,
+    )
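
For context, a minimal sketch of the recovery workflow the README addition describes, assuming the commands are run from the repository root with the project's Python environment already active; the plain `python` invocation and the `tee -a` append on resume are illustrative assumptions, not taken from the patch.

```bash
# Start the Bayesian optimization search, logging output to disk
./ana/start_bayesian_opt 2>&1 | tee ana/models/bayesian_optimization/bayesian_opt.log

# If an iteration is SIGKILLed by the OS, copy its hyperparameters into the
# bad_point_to_write and bad_point_to_write_clean dicts in ana/manual_bad_point.py,
# then register the point as failed so it is written to the JSON and CSV logs
python ana/manual_bad_point.py

# Resume the search; the manually logged point now carries BAD_TARGET,
# so the optimizer can steer away from it instead of crashing on it again
./ana/start_bayesian_opt 2>&1 | tee -a ana/models/bayesian_optimization/bayesian_opt.log
```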