From f567f265c8bfc404acf3c75edaea2e0b3ee3e631 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 5 Aug 2024 19:44:58 -0400 Subject: [PATCH] fix: amd/linux script improvements; `exit_on_unhandled_faults` improvement fix: respect `amd` CLI arg for `run_worker.py` fix: tar command fix: use correct pytorch channel for rocm fix: els0 fix: more aggressive exiting with `exit_on_unhandled_faults: true` feat: `preload-models.sh` - This complements the existing .cmd script for windows fix: set +x on `preload-models.sh`; skip jemalloc check fix: set +x on preload-models.sh fix: make sure micromamba has full path Co-Authored-By: Divided by Zer0 --- .../process_management/process_manager.py | 33 ++++++++++++++----- horde_worker_regen/run_worker.py | 7 +++- preload-models.sh | 9 +++++ update-runtime-rocm.sh | 14 ++++---- update-runtime.sh | 12 +++---- 5 files changed, 53 insertions(+), 22 deletions(-) create mode 100755 preload-models.sh diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 4fb37b05..9cc1746f 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3084,7 +3084,7 @@ async def api_job_pop(self) -> None: ) if self.bridge_data.exit_on_unhandled_faults: logger.error("Exiting due to exit_on_unhandled_faults being enabled") - sys.exit(1) + self._abort() await asyncio.sleep(180) self._consecutive_failed_jobs = 0 logger.info("Resuming job pops") @@ -3610,7 +3610,7 @@ def detect_deadlock(self) -> None: and self._process_map.num_busy_processes() == 0 ): logger.debug("Deadlock still detected after 10 seconds. Attempting to recover.") - self._cleanup_jobs() + self._purge_jobs() self._in_deadlock = False elif ( self._in_deadlock @@ -3792,6 +3792,7 @@ def signal_handler(self, sig: int, frame: object) -> None: """Handle SIGINT and SIGTERM.""" if self._caught_sigints >= 2: logger.warning("Caught SIGINT or SIGTERM three times, exiting immediately") + self._start_timed_shutdown() sys.exit(1) self._caught_sigints += 1 @@ -3817,7 +3818,14 @@ def shutdown() -> None: _recently_recovered = False - def _cleanup_jobs(self) -> None: + def _purge_jobs(self) -> None: + """Clear all jobs immediately. + + Note: This is a last resort and should only be used when the worker is in a black hole and can't recover. + Jobs will timeout on the server side and be requeued if they are still valid but due to the worker not + responding, they will spend much longer in the queue than they should while the server waits for the worker + to respond (and ultimately times out). + """ if len(self.job_deque) > 0: self.job_deque.clear() self._last_job_submitted_time = time.time() @@ -3849,6 +3857,7 @@ def _hard_kill_processes( safety: bool = True, all_: bool = True, ) -> None: + """Kill all processes immediately.""" for process_info in self._process_map.values(): if ( (inference and process_info.process_type == HordeProcessType.INFERENCE) @@ -3893,6 +3902,17 @@ def _check_and_replace_process( return True return False + def _abort(self) -> None: + """Exit as soon as possible, aborting all processes and jobs immediately.""" + with logger.catch(), open(".abort", "w") as f: + f.write("") + + self._purge_jobs() + + self._shutting_down = True + self._hard_kill_processes() + self._start_timed_shutdown() + def replace_hung_processes(self) -> bool: """Replaces processes that haven't checked in since `process_timeout` seconds in bridgeData.""" now = time.time() @@ -3913,15 +3933,12 @@ def timed_unset_recently_recovered() -> None: ) or ((now - self._last_job_submitted_time) > self.bridge_data.process_timeout) ) and not (self._last_pop_no_jobs_available or self._recently_recovered): - self._cleanup_jobs() + self._purge_jobs() if self.bridge_data.exit_on_unhandled_faults: logger.error("All processes have been unresponsive for too long, exiting.") - self._shutting_down = True - self._hard_kill_processes() - self._start_timed_shutdown() - + self._abort() logger.error("Exiting due to exit_on_unhandled_faults being enabled") return True diff --git a/horde_worker_regen/run_worker.py b/horde_worker_regen/run_worker.py index 55b02c9e..8f48af80 100644 --- a/horde_worker_regen/run_worker.py +++ b/horde_worker_regen/run_worker.py @@ -133,6 +133,11 @@ def init() -> None: with contextlib.suppress(Exception): multiprocessing.set_start_method("spawn", force=True) + if os.path.exists(".abort"): + with logger.catch(reraise=True): + os.remove(".abort") + logger.debug("Removed .abort file") + print(f"Multiprocessing start method: {multiprocessing.get_start_method()}") # Create args for -v, allowing -vvv @@ -201,7 +206,7 @@ def init() -> None: # We only need to download the legacy DBs once, so we do it here instead of in the worker processes - main(multiprocessing.get_context("spawn"), args.load_config_from_env_vars) + main(multiprocessing.get_context("spawn"), args.load_config_from_env_vars, amd_gpu=args.amd) if __name__ == "__main__": diff --git a/preload-models.sh b/preload-models.sh new file mode 100755 index 00000000..25181754 --- /dev/null +++ b/preload-models.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Get the directory of the current script +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +if "$SCRIPT_DIR/runtime.sh" python -s "$SCRIPT_DIR/download_models.py"; then + echo "Model Download OK." +else + echo "download_models.py exited with error code." +fi diff --git a/update-runtime-rocm.sh b/update-runtime-rocm.sh index 84f5794c..7e137257 100755 --- a/update-runtime-rocm.sh +++ b/update-runtime-rocm.sh @@ -27,18 +27,18 @@ done CONDA_ENVIRONMENT_FILE=environment.rocm.yaml -wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj "$SCRIPT_DIR/bin/micromamba" +wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj -C "${SCRIPT_DIR}" if [ ! -f "$SCRIPT_DIR/conda/envs/linux/bin/python" ]; then - bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y + ${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y fi -bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y +${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y if [ "$hordelib" = true ]; then - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/rocm6.1 + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/rocm6.0 else - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.rocm.txt" -U --extra-index-url https://download.pytorch.org/whl/rocm6.1 + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.rocm.txt" -U --extra-index-url https://download.pytorch.org/whl/rocm6.0 fi -bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux "$SCRIPT_DIR/horde_worker_regen/amd_go_fast/install_amd_go_fast.sh" +${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux "$SCRIPT_DIR/horde_worker_regen/amd_go_fast/install_amd_go_fast.sh" diff --git a/update-runtime.sh b/update-runtime.sh index cbd8580e..30eeb060 100755 --- a/update-runtime.sh +++ b/update-runtime.sh @@ -27,15 +27,15 @@ done CONDA_ENVIRONMENT_FILE=environment.yaml -wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj "$SCRIPT_DIR/bin/micromamba" +wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj -C "${SCRIPT_DIR}" if [ ! -f "$SCRIPT_DIR/conda/envs/linux/bin/python" ]; then - bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y + ${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y fi -bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y +${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y if [ "$hordelib" = true ]; then - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/cu121 + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/cu121 else - bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.txt" -U --extra-index-url https://download.pytorch.org/whl/cu121 + ${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.txt" -U --extra-index-url https://download.pytorch.org/whl/cu121 fi