Skip to content

Commit

Permalink
fix: amd/linux script improvements; exit_on_unhandled_faults improv…
Browse files Browse the repository at this point in the history
…ement

fix: respect `amd` CLI arg for `run_worker.py`
fix: tar command
fix: use correct pytorch channel for rocm
fix: els0
fix: more aggressive exiting with `exit_on_unhandled_faults: true`
feat: `preload-models.sh` - This complements the existing .cmd script for windows
fix: set +x on `preload-models.sh`; skip jemalloc check
fix: set +x on preload-models.sh
fix: make sure micromamba has full path
Co-Authored-By: Divided by Zer0 <[email protected]>
  • Loading branch information
tazlin and db0 committed Aug 22, 2024
1 parent cccfe83 commit f567f26
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 22 deletions.
33 changes: 25 additions & 8 deletions horde_worker_regen/process_management/process_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3084,7 +3084,7 @@ async def api_job_pop(self) -> None:
)
if self.bridge_data.exit_on_unhandled_faults:
logger.error("Exiting due to exit_on_unhandled_faults being enabled")
sys.exit(1)
self._abort()
await asyncio.sleep(180)
self._consecutive_failed_jobs = 0
logger.info("Resuming job pops")
Expand Down Expand Up @@ -3610,7 +3610,7 @@ def detect_deadlock(self) -> None:
and self._process_map.num_busy_processes() == 0
):
logger.debug("Deadlock still detected after 10 seconds. Attempting to recover.")
self._cleanup_jobs()
self._purge_jobs()
self._in_deadlock = False
elif (
self._in_deadlock
Expand Down Expand Up @@ -3792,6 +3792,7 @@ def signal_handler(self, sig: int, frame: object) -> None:
"""Handle SIGINT and SIGTERM."""
if self._caught_sigints >= 2:
logger.warning("Caught SIGINT or SIGTERM three times, exiting immediately")
self._start_timed_shutdown()
sys.exit(1)

self._caught_sigints += 1
Expand All @@ -3817,7 +3818,14 @@ def shutdown() -> None:

_recently_recovered = False

def _cleanup_jobs(self) -> None:
def _purge_jobs(self) -> None:
"""Clear all jobs immediately.
Note: This is a last resort and should only be used when the worker is in a black hole and can't recover.
Jobs will timeout on the server side and be requeued if they are still valid but due to the worker not
responding, they will spend much longer in the queue than they should while the server waits for the worker
to respond (and ultimately times out).
"""
if len(self.job_deque) > 0:
self.job_deque.clear()
self._last_job_submitted_time = time.time()
Expand Down Expand Up @@ -3849,6 +3857,7 @@ def _hard_kill_processes(
safety: bool = True,
all_: bool = True,
) -> None:
"""Kill all processes immediately."""
for process_info in self._process_map.values():
if (
(inference and process_info.process_type == HordeProcessType.INFERENCE)
Expand Down Expand Up @@ -3893,6 +3902,17 @@ def _check_and_replace_process(
return True
return False

def _abort(self) -> None:
    """Abort the worker: drop all jobs, hard-kill the processes and start shutting down.

    An empty `.abort` marker file is written to the current working directory
    before anything else is torn down. The steps then run in a fixed order:
    purge jobs, flag the manager as shutting down, hard-kill the processes and
    start the timed shutdown.
    """
    # The marker is created inside `logger.catch()`; presumably (loguru) this logs
    # and suppresses any error from opening/writing the file so the abort still
    # proceeds — TODO confirm against the logger in use.
    with logger.catch():
        with open(".abort", "w") as abort_marker:
            abort_marker.write("")

    self._purge_jobs()

    self._shutting_down = True
    self._hard_kill_processes()
    self._start_timed_shutdown()

def replace_hung_processes(self) -> bool:
"""Replaces processes that haven't checked in since `process_timeout` seconds in bridgeData."""
now = time.time()
Expand All @@ -3913,15 +3933,12 @@ def timed_unset_recently_recovered() -> None:
)
or ((now - self._last_job_submitted_time) > self.bridge_data.process_timeout)
) and not (self._last_pop_no_jobs_available or self._recently_recovered):
self._cleanup_jobs()
self._purge_jobs()

if self.bridge_data.exit_on_unhandled_faults:
logger.error("All processes have been unresponsive for too long, exiting.")

self._shutting_down = True
self._hard_kill_processes()
self._start_timed_shutdown()

self._abort()
logger.error("Exiting due to exit_on_unhandled_faults being enabled")

return True
Expand Down
7 changes: 6 additions & 1 deletion horde_worker_regen/run_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ def init() -> None:
with contextlib.suppress(Exception):
multiprocessing.set_start_method("spawn", force=True)

if os.path.exists(".abort"):
with logger.catch(reraise=True):
os.remove(".abort")
logger.debug("Removed .abort file")

print(f"Multiprocessing start method: {multiprocessing.get_start_method()}")

# Create args for -v, allowing -vvv
Expand Down Expand Up @@ -201,7 +206,7 @@ def init() -> None:

# We only need to download the legacy DBs once, so we do it here instead of in the worker processes

main(multiprocessing.get_context("spawn"), args.load_config_from_env_vars)
main(multiprocessing.get_context("spawn"), args.load_config_from_env_vars, amd_gpu=args.amd)


if __name__ == "__main__":
Expand Down
9 changes: 9 additions & 0 deletions preload-models.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Run download_models.py through runtime.sh and report whether it succeeded.
# On failure the script exits with the same non-zero status that the
# runtime.sh invocation returned, so callers can detect a failed download.

# Get the directory of the current script
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

if "$SCRIPT_DIR/runtime.sh" python -s "$SCRIPT_DIR/download_models.py"; then
    echo "Model Download OK."
else
    # $? still holds the status of the `if` condition command at this point;
    # capture it before `echo` overwrites it.
    status=$?
    echo "download_models.py exited with error code." >&2
    exit "$status"
fi
14 changes: 7 additions & 7 deletions update-runtime-rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,18 @@ done

CONDA_ENVIRONMENT_FILE=environment.rocm.yaml

wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj "$SCRIPT_DIR/bin/micromamba"
wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj -C "${SCRIPT_DIR}"
if [ ! -f "$SCRIPT_DIR/conda/envs/linux/bin/python" ]; then
bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
fi
bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y

if [ "$hordelib" = true ]; then
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/rocm6.1
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/rocm6.0
else
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.rocm.txt" -U --extra-index-url https://download.pytorch.org/whl/rocm6.1
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.rocm.txt" -U --extra-index-url https://download.pytorch.org/whl/rocm6.0

fi

bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux "$SCRIPT_DIR/horde_worker_regen/amd_go_fast/install_amd_go_fast.sh"
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux "$SCRIPT_DIR/horde_worker_regen/amd_go_fast/install_amd_go_fast.sh"
12 changes: 6 additions & 6 deletions update-runtime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ done

CONDA_ENVIRONMENT_FILE=environment.yaml

wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj "$SCRIPT_DIR/bin/micromamba"
wget -qO- https://github.com/mamba-org/micromamba-releases/releases/latest/download/micromamba-linux-64.tar.bz2 | tar -xvj -C "${SCRIPT_DIR}"
if [ ! -f "$SCRIPT_DIR/conda/envs/linux/bin/python" ]; then
bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
fi
bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y
${SCRIPT_DIR}/bin/micromamba create --no-shortcuts -r "$SCRIPT_DIR/conda" -n linux -f ${CONDA_ENVIRONMENT_FILE} -y

if [ "$hordelib" = true ]; then
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/cu121
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip uninstall -y hordelib horde_engine horde_sdk horde_model_reference
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install horde_engine horde_model_reference --extra-index-url https://download.pytorch.org/whl/cu121
else
bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.txt" -U --extra-index-url https://download.pytorch.org/whl/cu121
${SCRIPT_DIR}/bin/micromamba run -r "$SCRIPT_DIR/conda" -n linux python -s -m pip install -r "$SCRIPT_DIR/requirements.txt" -U --extra-index-url https://download.pytorch.org/whl/cu121
fi

0 comments on commit f567f26

Please sign in to comment.