Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UX] Add infeasibility reasons to the exception message #3986

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pathlib
import re
import shlex
import shutil
import signal
import subprocess
import sys
Expand Down Expand Up @@ -151,6 +152,9 @@
# might be added during ssh.
_MAX_INLINE_SCRIPT_LENGTH = 120 * 1024

# Header line of the ResourcesUnavailableError message; a table of
# per-resource failure reasons is appended after it on a new line.
_RESOURCES_UNAVAILABLE_LOG = (
'Reasons for provision failures (for details, please check the log above):')


def _is_command_length_over_limit(command: str) -> bool:
"""Check if the length of the command exceeds the limit.
Expand Down Expand Up @@ -1926,6 +1930,7 @@ def provision_with_retries(
self._optimize_target is None)

failover_history: List[Exception] = list()
resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()

style = colorama.Style
fore = colorama.Fore
Expand Down Expand Up @@ -2016,6 +2021,7 @@ def provision_with_retries(
# Add failed resources to the blocklist, only when it
# is in fallback mode.
_add_to_blocked_resources(self._blocked_resources, to_provision)
resource_exceptions[to_provision] = failover_history[-1]
else:
# If we reach here, it means that the existing cluster must have
# a previous status of INIT, because other statuses (UP,
Expand Down Expand Up @@ -2052,7 +2058,22 @@ def provision_with_retries(
# possible resources or the requested resources are too
# restrictive. If we reach here, our failover logic finally
# ends here.
raise e.with_failover_history(failover_history)
table = log_utils.create_table(['Resource', 'Reason'])
for (resource, exception) in resource_exceptions.items():
launched_resource_str = str(resource)
# accelerator_args is way too long.
# Convert from:
# GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'}) # pylint: disable=line-too-long
# to:
# GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
pattern = ', accelerator_args={.*}'
launched_resource_str = re.sub(pattern, '...',
launched_resource_str)
table.add_row([launched_resource_str, exception])
table.max_table_width = shutil.get_terminal_size().columns
raise exceptions.ResourcesUnavailableError(
_RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
failover_history=failover_history)
to_provision = task.best_resources
assert task in self._dag.tasks, 'Internal logic error.'
assert to_provision is not None, task
Expand Down Expand Up @@ -2805,7 +2826,7 @@ def _provision(
'`--retry-until-up` flag.')
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesUnavailableError(
error_message,
error_message + '\n' + str(e),
failover_history=e.failover_history) from None
if dryrun:
record = global_user_state.get_cluster_from_name(cluster_name)
Expand Down