Skip to content

Commit

Permalink
fix ci and add event logging to help debug ci
Browse files Browse the repository at this point in the history
Signed-off-by: jessestutler <[email protected]>
  • Loading branch information
JesseStutler committed Nov 14, 2024
1 parent 8b2f918 commit b5ee9ef
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 0 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/e2e_spark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@ jobs:
name: "E2E about Spark Integration test"
runs-on: ubuntu-20.04
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# setting this to "true" frees about 6 GB,
# but might remove tools that are actually needed
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true

- name: Checkout current Volcano repository
if: github.event.inputs.volcano-branch==''
Expand Down
23 changes: 23 additions & 0 deletions test/e2e/util/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strings"
"time"

"github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -275,6 +276,7 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err

func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error {
var additionalError error
podNotReadyCache := make(map[string]*v1.Pod)
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)
Expand All @@ -285,9 +287,13 @@ func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase
continue
}

podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
podNotReadyCache[podKey] = &pod
for _, p := range phase {
if pod.Status.Phase == p {
// pod turns to expected phase
readyTaskNum++
delete(podNotReadyCache, podKey)
break
}
}
Expand All @@ -302,11 +308,28 @@ func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase
return ready, nil
})
if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
logEventsOfNotReadyPods(ctx, podNotReadyCache, phase)
return fmt.Errorf("[Wait time out]: %s", additionalError)
}
return err
}

// logEventsOfNotReadyPods writes diagnostics to the Ginkgo writer for every pod
// in podNotReadyCache: one line stating that the pod is not in any of the
// expected phases, followed by one line per event with reason "Failed" whose
// involved object is that pod.
//
// It is invoked on the timeout path of WaitTaskPhase, so it is best-effort: an
// error while listing events is logged and the pod is skipped, rather than
// failing an assertion that would replace the caller's timeout error.
func logEventsOfNotReadyPods(ctx *TestContext, podNotReadyCache map[string]*v1.Pod, phase []v1.PodPhase) {
	for _, pod := range podNotReadyCache {
		// A single timestamp per pod is reused for all of that pod's log lines.
		timestamp := time.Now().Format(LogTimeFormat)
		ginkgo.GinkgoWriter.Printf("[Error]%s The pod <%s/%s> is not in %v phase\n", timestamp, pod.Namespace, pod.Name, phase)

		// Currently only events with reason "Failed" are fetched; the namespace
		// is already scoped by the Events(pod.Namespace) client.
		fieldSelector := fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s,reason=Failed", pod.Name)
		events, err := ctx.Kubeclient.CoreV1().Events(pod.Namespace).List(context.TODO(), metav1.ListOptions{
			FieldSelector: fieldSelector,
		})
		if err != nil {
			// Do not assert here: this helper only adds debugging output and
			// must not hide the original wait timeout from the caller.
			ginkgo.GinkgoWriter.Printf("[Error]%s failed to get events related with pod %s in namespace %s: %v\n", timestamp, pod.Name, pod.Namespace, err)
			continue
		}

		for _, event := range events.Items {
			ginkgo.GinkgoWriter.Printf("[Error]%s Event related with pod <%s/%s>: Reason: %s, Message: %s\n", timestamp, pod.Namespace, pod.Name, event.Reason, event.Message)
		}
	}
}

func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error {
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {

Expand Down
1 change: 1 addition & 0 deletions test/e2e/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ const (
DefaultTFImage = "volcanosh/dist-mnist-tf-example:0.0.1"
// "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" is from "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-9ee8fda"
DefaultPytorchImage = "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1"
LogTimeFormat = "[ 2006/01/02 15:04:05.000 ]"
)

func CPUResource(request string) v1.ResourceList {
Expand Down

0 comments on commit b5ee9ef

Please sign in to comment.