Skip to content

Commit

Permalink
check the logs of pod
Browse files Browse the repository at this point in the history
Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit committed Sep 13, 2024
1 parent 08c8634 commit 7a98a00
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/e2e-test-tune-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ jobs:
tune-api: true
training-operator: true

# Step to get logs of the relevant Experiment pod
- name: Fetch Experiment Pod Logs
run: |
POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2)
echo "Fetching logs for pod: $POD_NAME"
kubectl logs $POD_NAME -n default
strategy:
fail-fast: false
matrix:
Expand Down
27 changes: 23 additions & 4 deletions test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,30 @@
from verify import verify_experiment_results

# Experiment timeout in seconds (reduced from 60 min to 15 min in this commit).
EXPERIMENT_TIMEOUT = 60 * 60
EXPERIMENT_TIMEOUT = 60 * 15

# The default logging config.
logging.basicConfig(level=logging.INFO)


def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str):
    """Log the container output of every pod that belongs to an Experiment.

    This is a best-effort debugging helper: it lists all pods in
    ``exp_namespace``, keeps the ones whose name contains ``exp_name`` and
    writes the logs of each of their containers to the ``logging`` output.
    A failure to read one container's logs is reported and does not stop
    the remaining containers or pods from being processed.

    Args:
        katib_client: Katib client of the running test. It is not used by
            this helper and is kept only so existing call sites stay valid.
        exp_name: Experiment name; pods are matched by substring on their
            ``metadata.name``.
        exp_namespace: Namespace in which the pods are listed.

    Returns:
        None. The logs are emitted through ``logging`` only.
    """
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace=exp_namespace)

    for pod in pods.items:
        pod_name = pod.metadata.name
        # Keep only the pods related to the specific Katib Experiment.
        if exp_name not in pod_name:
            continue

        # The log endpoint rejects a request without a container name when the
        # pod runs more than one container, so read every container explicitly
        # instead of relying on the server-side default.
        containers = (pod.spec.containers if pod.spec else None) or []
        for container in containers:
            target = f"{pod_name} (container: {container.name})"
            logging.info(f"Fetching logs for pod: {target}")
            try:
                pod_logs = v1.read_namespaced_pod_log(
                    name=pod_name,
                    namespace=exp_namespace,
                    container=container.name,
                )
            except Exception as e:
                # Deliberately broad: this helper runs while another failure
                # is already being reported and must never mask it.
                logging.error(f"Failed to get logs for pod {target}: {str(e)}")
            else:
                logging.info(f"Logs for pod {target}:\n{pod_logs}")


# Test for Experiment created with custom objective.
def run_e2e_experiment_create_by_tune_with_custom_objective(
katib_client: KatibClient,
Expand Down Expand Up @@ -117,7 +135,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
resources_per_trial=katib.TrainerResources(
num_workers=1,
num_procs_per_worker=1,
resources_per_worker={"cpu": "2", "memory": "10G",},
resources_per_worker={"cpu": "1", "memory": "10G",},
),
)
experiment = katib_client.wait_for_experiment_condition(
Expand Down Expand Up @@ -166,7 +184,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
# Delete the Experiment.
logging.info("---------------------------------------------------------------")
logging.info("---------------------------------------------------------------")
katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
#katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)

try:
run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
Expand All @@ -175,9 +193,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
except Exception as e:
logging.info("---------------------------------------------------------------")
logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
raise e
finally:
# Delete the Experiment.
logging.info("---------------------------------------------------------------")
logging.info("---------------------------------------------------------------")
katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)
#katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)

0 comments on commit 7a98a00

Please sign in to comment.