Merge remote-tracking branch 'origin/main' into node-info-resource-type

tensorchord · Oct 24, 2023 · 6588898 · 6588898
2 parents be60274 + dec5979
commit 6588898
Show file tree

Hide file tree

Showing 16 changed files with 142 additions and 57 deletions.
diff --git a/agent/api/types/const.go b/agent/api/types/const.go
diff --git a/agent/api/types/event.go b/agent/api/types/event.go
@@ -9,6 +9,9 @@ const (
 	DeploymentScaleUpEvent    = "deployment-scale-up"
 	DeploymentScaleDownEvent  = "deployment-scale-down"
 	DeploymentScaleBlockEvent = "deployment-scale-block"
+	PodCreateEvent            = "pod-create"
+	PodReadyEvent             = "pod-ready"
+	PodTimeoutEvent           = "pod-timeout"
 )
 
 type DeploymentEvent struct {

diff --git a/agent/pkg/consts/consts.go b/agent/pkg/consts/consts.go
@@ -3,12 +3,6 @@ package consts
 import "time"
 
 const (
-	LabelBuildName            = "ai.tensorchord.build"
-	LabelName                 = "ai.tensorchord.name"
-	LabelServerResource       = "ai.tensorchord.server-resource"
-	AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
-	ModelzAnnotationValue     = "modelz"
-
 	Domain        = "modelz.live"
 	DefaultPrefix = "modelz-"
 	APIKEY_PREFIX = "mzi-"

diff --git a/agent/pkg/k8s/convert_inference.go b/agent/pkg/k8s/convert_inference.go
@@ -58,23 +58,7 @@ func AsInferenceDeployment(inf *v2alpha1.Inference, item *appsv1.Deployment) *ty
 		res.Status.CreatedAt = &item.CreationTimestamp.Time
 		res.Status.InvocationCount = 0
 		res.Status.AvailableReplicas = item.Status.AvailableReplicas
-
-		res.Status.Phase = types.PhaseNotReady
-		for _, c := range item.Status.Conditions {
-			if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
-				res.Status.Phase = types.PhaseReady
-			} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
-				res.Status.Phase = types.PhaseScaling
-			}
-		}
-
-		if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
-			res.Status.Phase = types.PhaseNoReplicas
-		}
-
-		if item.DeletionTimestamp != nil {
-			res.Status.Phase = types.PhaseTerminating
-		}
+		res.Status.Phase = AsStatusPhase(item)
 	}
 	return res
 }
@@ -98,3 +82,23 @@ func AsResourceList(resources v1.ResourceList) types.ResourceList {
 	}
 	return res
 }
+
+func AsStatusPhase(item *appsv1.Deployment) types.Phase {
+	phase := types.PhaseNotReady
+	for _, c := range item.Status.Conditions {
+		if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
+			phase = types.PhaseReady
+		} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
+			phase = types.PhaseScaling
+		}
+	}
+
+	if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
+		phase = types.PhaseNoReplicas
+	}
+
+	if item.DeletionTimestamp != nil {
+		phase = types.PhaseTerminating
+	}
+	return phase
+}
diff --git a/agent/pkg/k8s/generate_image_cache.go b/agent/pkg/k8s/generate_image_cache.go
@@ -5,8 +5,8 @@ import (
 
 	kubefledged "github.com/senthilrch/kube-fledged/pkg/apis/kubefledged/v1alpha3"
 	"github.com/tensorchord/openmodelz/agent/api/types"
-	"github.com/tensorchord/openmodelz/agent/pkg/consts"
 	modelzetes "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"

diff --git a/agent/pkg/k8s/generate_job.go b/agent/pkg/k8s/generate_job.go
@@ -10,9 +10,8 @@ import (
 	"k8s.io/apimachinery/pkg/runtime/schema"
 
 	"github.com/tensorchord/openmodelz/agent/api/types"
-	"github.com/tensorchord/openmodelz/agent/pkg/consts"
 	"github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
-	mzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 )
 
 func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, buildkitdAddr, buildctlBin, secret string) (*batchv1.Job, error) {
@@ -70,8 +69,8 @@ func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, bui
 			Namespace:       req.Spec.Namespace,
 			OwnerReferences: ownerReference,
 			Labels: map[string]string{
-				consts.LabelBuildName:       req.Spec.Name,
-				mzconsts.AnnotationBuilding: "true",
+				consts.LabelBuildName:     req.Spec.Name,
+				consts.AnnotationBuilding: "true",
 			},
 		},
 		Spec: batchv1.JobSpec{

diff --git a/agent/pkg/metrics/exporter.go b/agent/pkg/metrics/exporter.go
@@ -55,6 +55,8 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 	e.metricOptions.ServiceAvailableReplicasGauge.Reset()
 	e.metricOptions.ServiceTargetLoad.Reset()
 
+	e.metricOptions.PodStartHistogram.Collect(ch)
+
 	for _, service := range e.services {
 		var serviceName string
 		if len(service.Spec.Namespace) > 0 {

diff --git a/agent/pkg/metrics/metrics.go b/agent/pkg/metrics/metrics.go
@@ -21,6 +21,8 @@ type MetricOptions struct {
 	ServiceReplicasGauge          *prometheus.GaugeVec
 	ServiceAvailableReplicasGauge *prometheus.GaugeVec
 	ServiceTargetLoad             *prometheus.GaugeVec
+
+	PodStartHistogram *prometheus.HistogramVec
 }
 
 // ServiceMetricOptions provides RED metrics
@@ -108,6 +110,12 @@ func BuildMetricsOptions() MetricOptions {
 		[]string{"inference_name"},
 	)
 
+	podStartHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    "pod_start_seconds",
+		Help:    "Pod start time taken",
+		Buckets: prometheus.ExponentialBuckets(8, 1.5, 10),
+	}, []string{"inference_name", "source_image"})
+
 	metricsOptions := MetricOptions{
 		GatewayInferencesHistogram:         gatewayInferencesHistogram,
 		GatewayInferenceInvocation:         gatewayInferenceInvocation,
@@ -116,6 +124,7 @@ func BuildMetricsOptions() MetricOptions {
 		ServiceTargetLoad:                  serviceTargetLoad,
 		GatewayInferenceInvocationStarted:  gatewayInferenceInvocationStarted,
 		GatewayInferenceInvocationInflight: gatewayInferenceInvocationInflight,
+		PodStartHistogram:                  podStartHistogram,
 	}
 
 	return metricsOptions

diff --git a/agent/pkg/runtime/inference_create.go b/agent/pkg/runtime/inference_create.go
@@ -4,9 +4,6 @@ import (
 	"context"
 	"fmt"
 
-	ingressv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
-	v2alpha1 "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
-	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 	v1 "k8s.io/api/core/v1"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -15,6 +12,9 @@ import (
 	"github.com/tensorchord/openmodelz/agent/errdefs"
 	"github.com/tensorchord/openmodelz/agent/pkg/config"
 	localconsts "github.com/tensorchord/openmodelz/agent/pkg/consts"
+	ingressv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
+	v2alpha1 "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 )
 
 func (r generalRuntime) InferenceCreate(ctx context.Context,
@@ -37,7 +37,7 @@ func (r generalRuntime) InferenceCreate(ctx context.Context,
 	// Create the ingress
 	// TODO(gaocegege): Check if the domain is already used.
 	if r.ingressEnabled {
-		name := req.Spec.Labels[localconsts.LabelName]
+		name := req.Spec.Labels[consts.LabelName]
 
 		if r.ingressAnyIPToDomain {
 			// Get the service with type=loadbalancer.
@@ -208,8 +208,8 @@ func makeIngress(request types.InferenceDeployment, cfg config.IngressConfig) (*
 	}
 
 	annotation := map[string]string{}
-	if value, exist := request.Spec.Annotations[localconsts.AnnotationControlPlaneKey]; exist {
-		annotation[localconsts.AnnotationControlPlaneKey] = value
+	if value, exist := request.Spec.Annotations[consts.AnnotationControlPlaneKey]; exist {
+		annotation[consts.AnnotationControlPlaneKey] = value
 	}
 	ingress.Annotations = annotation
 

diff --git a/agent/pkg/runtime/namespace.go b/agent/pkg/runtime/namespace.go
@@ -8,14 +8,14 @@ import (
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
-	"github.com/tensorchord/openmodelz/agent/api/types"
 	"github.com/tensorchord/openmodelz/agent/errdefs"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 )
 
 func (r generalRuntime) NamespaceList(ctx context.Context) ([]string, error) {
 	ns, err := r.kubeClient.CoreV1().Namespaces().List(ctx,
 		metav1.ListOptions{
-			LabelSelector: fmt.Sprintf("%s=true", types.LabelNamespace),
+			LabelSelector: fmt.Sprintf("%s=true", consts.LabelNamespace),
 		})
 	if err != nil {
 		if k8serrors.IsNotFound(err) {
@@ -37,7 +37,7 @@ func (r generalRuntime) NamespaceCreate(ctx context.Context, name string) error
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
 			Labels: map[string]string{
-				types.LabelNamespace: "true",
+				consts.LabelNamespace: "true",
 			},
 		},
 	}

diff --git a/agent/pkg/runtime/node.go b/agent/pkg/runtime/node.go
@@ -4,7 +4,7 @@ import (
 	"context"
 
 	"github.com/sirupsen/logrus"
-	"github.com/tensorchord/openmodelz/agent/pkg/consts"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 

diff --git a/agent/pkg/runtime/runtime.go b/agent/pkg/runtime/runtime.go
@@ -4,6 +4,7 @@ import (
 	"context"
 
 	"github.com/gin-gonic/gin"
+
 	"github.com/sirupsen/logrus"
 	apicorev1 "k8s.io/api/core/v1"
 	appsv1 "k8s.io/client-go/informers/apps/v1"

diff --git a/agent/pkg/server/server_init_kubernetes.go b/agent/pkg/server/server_init_kubernetes.go
@@ -1,15 +1,23 @@
 package server
 
 import (
+	"context"
 	"fmt"
+	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	kubeinformers "k8s.io/client-go/informers"
+	kubeinformersv1 "k8s.io/client-go/informers/core/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/clientcmd"
 
 	kubefledged "github.com/senthilrch/kube-fledged/pkg/client/clientset/versioned"
+	"github.com/tensorchord/openmodelz/agent/api/types"
+	"github.com/tensorchord/openmodelz/agent/pkg/event"
 	"github.com/tensorchord/openmodelz/agent/pkg/k8s"
 	"github.com/tensorchord/openmodelz/agent/pkg/log"
 	"github.com/tensorchord/openmodelz/agent/pkg/runtime"
@@ -78,6 +86,7 @@ func (s *Server) initKubernetesResources() error {
 	}
 
 	pods := kubeInformerFactory.Core().V1().Pods()
+	s.podStartWatch(pods, kubeClient)
 	go pods.Informer().Run(stopCh)
 	if ok := cache.WaitForNamedCacheSync(
 		fmt.Sprintf("%s:pods", consts.ProviderName),
@@ -120,3 +129,67 @@ func (s *Server) initKubernetesResources() error {
 	}
 	return nil
 }
+
+// podStartWatch log event when pod start began and finished
+func (s *Server) podStartWatch(pods kubeinformersv1.PodInformer, client *kubernetes.Clientset) {
+	pods.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			new := obj.(*v1.Pod)
+			controlPlane, exist := new.Annotations[consts.AnnotationControlPlaneKey]
+			// for inference created by modelz apiserver
+			if !exist || controlPlane != consts.ModelzAnnotationValue {
+				return
+			}
+			podWatchEventLog(s.eventRecorder, new, types.PodCreateEvent)
+			start := time.Now()
+
+			// Ticker will keep watching until pod start or timeout
+			ticker := time.NewTicker(time.Second * 2)
+			timeout := time.After(5 * time.Minute)
+			go func() {
+				for {
+					select {
+					case <-timeout:
+						podWatchEventLog(s.eventRecorder, new, types.PodTimeoutEvent)
+						return
+					case <-ticker.C:
+						pod, err := client.CoreV1().Pods(new.Namespace).Get(context.TODO(), new.Name, metav1.GetOptions{})
+						if err != nil {
+							logrus.WithFields(logrus.Fields{
+								"namespace":  pod.Namespace,
+								"deployment": pod.Labels["app"],
+								"name":       pod.Name,
+							}).Errorf("failed to get pod: %s", err)
+							return
+						}
+						for _, c := range pod.Status.Conditions {
+							if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
+								podWatchEventLog(s.eventRecorder, pod, types.PodReadyEvent)
+								label := prometheus.Labels{
+									"inference_name": fmt.Sprintf("%s.%s", pod.Labels["app"], pod.Namespace),
+									"source_image":   pod.Annotations[consts.AnnotationDockerImage]}
+								s.metricsOptions.PodStartHistogram.With(label).
+									Observe(time.Since(start).Seconds())
+								return
+							}
+						}
+					}
+				}
+			}()
+		},
+	})
+}
+
+// log status for pod watch status transfer
+func podWatchEventLog(recorder event.Interface, obj *v1.Pod, event string) {
+	deployment := obj.Labels["app"]
+	err := recorder.CreateDeploymentEvent(obj.Namespace, deployment, event, obj.Name)
+	if err != nil {
+		logrus.WithFields(logrus.Fields{
+			"namespace":  obj.Namespace,
+			"deployment": deployment,
+			"name":       obj.Name,
+			"event":      event,
+		}).Errorf("failed to create deployment event: %s", err)
+	}
+}
diff --git a/ingress-operator/pkg/consts/consts.go b/ingress-operator/pkg/consts/consts.go
@@ -1,8 +1,6 @@
 package consts
 
 const (
-	KeyCert                   = "cert"
-	EnvironmentPrefix         = "MODELZ"
-	AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
-	ModelzAnnotationValue     = "modelz"
+	KeyCert           = "cert"
+	EnvironmentPrefix = "MODELZ"
 )
diff --git a/ingress-operator/pkg/controller/core.go b/ingress-operator/pkg/controller/core.go
@@ -8,13 +8,6 @@ import (
 	"time"
 
 	"github.com/google/go-cmp/cmp"
-	faasv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
-	"github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
-	faasscheme "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
-	v1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/informers/externalversions/modelzetes/v1"
-	listers "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/listers/modelzetes/v1"
-	"github.com/tensorchord/openmodelz/ingress-operator/pkg/consts"
-	mdzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -27,6 +20,13 @@ import (
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/util/workqueue"
 	klog "k8s.io/klog"
+
+	faasv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
+	"github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
+	faasscheme "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
+	v1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/informers/externalversions/modelzetes/v1"
+	listers "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/listers/modelzetes/v1"
+	"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
 )
 
 const AgentName = "ingress-operator"
@@ -257,7 +257,7 @@ func MakeAnnotations(fni *faasv1.InferenceIngress, host string) map[string]strin
 	annotations := make(map[string]string)
 
 	annotations["ai.tensorchord.spec"] = string(specJSON)
-	inferenceNamespace := fni.Labels[mdzconsts.LabelInferenceNamespace]
+	inferenceNamespace := fni.Labels[consts.LabelInferenceNamespace]
 
 	if !fni.Spec.BypassGateway {
 		switch class {

diff --git a/modelzetes/pkg/consts/consts.go b/modelzetes/pkg/consts/consts.go
@@ -5,8 +5,16 @@ const (
 
 	LabelInferenceName      = "inference"
 	LabelInferenceNamespace = "inference-namespace"
+	LabelBuildName          = "ai.tensorchord.build"
+	LabelName               = "ai.tensorchord.name"
+	LabelNamespace          = "modelz.tensorchord.ai/namespace"
+	LabelServerResource     = "ai.tensorchord.server-resource"
 
-	AnnotationBuilding = "ai.tensorchord.building"
+	AnnotationBuilding        = "ai.tensorchord.building"
+	AnnotationDockerImage     = "ai.tensorchord.docker.image"
+	AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
+
+	ModelzAnnotationValue = "modelz"
 
 	TolerationGPU              = "ai.tensorchord.gpu"
 	TolerationNvidiaGPUPresent = "nvidia.com/gpu"