Skip to content
This repository has been archived by the owner on Jul 17, 2024. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'origin/main' into node-info-resource-type
Browse files Browse the repository at this point in the history
  • Loading branch information
xieydd committed Oct 24, 2023
2 parents be60274 + dec5979 commit 6588898
Show file tree
Hide file tree
Showing 16 changed files with 142 additions and 57 deletions.
6 changes: 0 additions & 6 deletions agent/api/types/const.go

This file was deleted.

3 changes: 3 additions & 0 deletions agent/api/types/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ const (
DeploymentScaleUpEvent = "deployment-scale-up"
DeploymentScaleDownEvent = "deployment-scale-down"
DeploymentScaleBlockEvent = "deployment-scale-block"
PodCreateEvent = "pod-create"
PodReadyEvent = "pod-ready"
PodTimeoutEvent = "pod-timeout"
)

type DeploymentEvent struct {
Expand Down
6 changes: 0 additions & 6 deletions agent/pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,6 @@ package consts
import "time"

const (
LabelBuildName = "ai.tensorchord.build"
LabelName = "ai.tensorchord.name"
LabelServerResource = "ai.tensorchord.server-resource"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
ModelzAnnotationValue = "modelz"

Domain = "modelz.live"
DefaultPrefix = "modelz-"
APIKEY_PREFIX = "mzi-"
Expand Down
38 changes: 21 additions & 17 deletions agent/pkg/k8s/convert_inference.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,7 @@ func AsInferenceDeployment(inf *v2alpha1.Inference, item *appsv1.Deployment) *ty
res.Status.CreatedAt = &item.CreationTimestamp.Time
res.Status.InvocationCount = 0
res.Status.AvailableReplicas = item.Status.AvailableReplicas

res.Status.Phase = types.PhaseNotReady
for _, c := range item.Status.Conditions {
if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
res.Status.Phase = types.PhaseReady
} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
res.Status.Phase = types.PhaseScaling
}
}

if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
res.Status.Phase = types.PhaseNoReplicas
}

if item.DeletionTimestamp != nil {
res.Status.Phase = types.PhaseTerminating
}
res.Status.Phase = AsStatusPhase(item)
}
return res
}
Expand All @@ -98,3 +82,23 @@ func AsResourceList(resources v1.ResourceList) types.ResourceList {
}
return res
}

func AsStatusPhase(item *appsv1.Deployment) types.Phase {
phase := types.PhaseNotReady
for _, c := range item.Status.Conditions {
if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
phase = types.PhaseReady
} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
phase = types.PhaseScaling
}
}

if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
phase = types.PhaseNoReplicas
}

if item.DeletionTimestamp != nil {
phase = types.PhaseTerminating
}
return phase
}
2 changes: 1 addition & 1 deletion agent/pkg/k8s/generate_image_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import (

kubefledged "github.com/senthilrch/kube-fledged/pkg/apis/kubefledged/v1alpha3"
"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
modelzetes "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down
7 changes: 3 additions & 4 deletions agent/pkg/k8s/generate_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ import (
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
mzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, buildkitdAddr, buildctlBin, secret string) (*batchv1.Job, error) {
Expand Down Expand Up @@ -70,8 +69,8 @@ func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, bui
Namespace: req.Spec.Namespace,
OwnerReferences: ownerReference,
Labels: map[string]string{
consts.LabelBuildName: req.Spec.Name,
mzconsts.AnnotationBuilding: "true",
consts.LabelBuildName: req.Spec.Name,
consts.AnnotationBuilding: "true",
},
},
Spec: batchv1.JobSpec{
Expand Down
2 changes: 2 additions & 0 deletions agent/pkg/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
e.metricOptions.ServiceAvailableReplicasGauge.Reset()
e.metricOptions.ServiceTargetLoad.Reset()

e.metricOptions.PodStartHistogram.Collect(ch)

for _, service := range e.services {
var serviceName string
if len(service.Spec.Namespace) > 0 {
Expand Down
9 changes: 9 additions & 0 deletions agent/pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type MetricOptions struct {
ServiceReplicasGauge *prometheus.GaugeVec
ServiceAvailableReplicasGauge *prometheus.GaugeVec
ServiceTargetLoad *prometheus.GaugeVec

PodStartHistogram *prometheus.HistogramVec
}

// ServiceMetricOptions provides RED metrics
Expand Down Expand Up @@ -108,6 +110,12 @@ func BuildMetricsOptions() MetricOptions {
[]string{"inference_name"},
)

podStartHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "pod_start_seconds",
Help: "Pod start time taken",
Buckets: prometheus.ExponentialBuckets(8, 1.5, 10),
}, []string{"inference_name", "source_image"})

metricsOptions := MetricOptions{
GatewayInferencesHistogram: gatewayInferencesHistogram,
GatewayInferenceInvocation: gatewayInferenceInvocation,
Expand All @@ -116,6 +124,7 @@ func BuildMetricsOptions() MetricOptions {
ServiceTargetLoad: serviceTargetLoad,
GatewayInferenceInvocationStarted: gatewayInferenceInvocationStarted,
GatewayInferenceInvocationInflight: gatewayInferenceInvocationInflight,
PodStartHistogram: podStartHistogram,
}

return metricsOptions
Expand Down
12 changes: 6 additions & 6 deletions agent/pkg/runtime/inference_create.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ import (
"context"
"fmt"

ingressv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
v2alpha1 "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
v1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -15,6 +12,9 @@ import (
"github.com/tensorchord/openmodelz/agent/errdefs"
"github.com/tensorchord/openmodelz/agent/pkg/config"
localconsts "github.com/tensorchord/openmodelz/agent/pkg/consts"
ingressv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
v2alpha1 "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

func (r generalRuntime) InferenceCreate(ctx context.Context,
Expand All @@ -37,7 +37,7 @@ func (r generalRuntime) InferenceCreate(ctx context.Context,
// Create the ingress
// TODO(gaocegege): Check if the domain is already used.
if r.ingressEnabled {
name := req.Spec.Labels[localconsts.LabelName]
name := req.Spec.Labels[consts.LabelName]

if r.ingressAnyIPToDomain {
// Get the service with type=loadbalancer.
Expand Down Expand Up @@ -208,8 +208,8 @@ func makeIngress(request types.InferenceDeployment, cfg config.IngressConfig) (*
}

annotation := map[string]string{}
if value, exist := request.Spec.Annotations[localconsts.AnnotationControlPlaneKey]; exist {
annotation[localconsts.AnnotationControlPlaneKey] = value
if value, exist := request.Spec.Annotations[consts.AnnotationControlPlaneKey]; exist {
annotation[consts.AnnotationControlPlaneKey] = value
}
ingress.Annotations = annotation

Expand Down
6 changes: 3 additions & 3 deletions agent/pkg/runtime/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import (
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/errdefs"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

func (r generalRuntime) NamespaceList(ctx context.Context) ([]string, error) {
ns, err := r.kubeClient.CoreV1().Namespaces().List(ctx,
metav1.ListOptions{
LabelSelector: fmt.Sprintf("%s=true", types.LabelNamespace),
LabelSelector: fmt.Sprintf("%s=true", consts.LabelNamespace),
})
if err != nil {
if k8serrors.IsNotFound(err) {
Expand All @@ -37,7 +37,7 @@ func (r generalRuntime) NamespaceCreate(ctx context.Context, name string) error
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
types.LabelNamespace: "true",
consts.LabelNamespace: "true",
},
},
}
Expand Down
2 changes: 1 addition & 1 deletion agent/pkg/runtime/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"context"

"github.com/sirupsen/logrus"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down
1 change: 1 addition & 0 deletions agent/pkg/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"

"github.com/gin-gonic/gin"

"github.com/sirupsen/logrus"
apicorev1 "k8s.io/api/core/v1"
appsv1 "k8s.io/client-go/informers/apps/v1"
Expand Down
73 changes: 73 additions & 0 deletions agent/pkg/server/server_init_kubernetes.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
package server

import (
"context"
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeinformers "k8s.io/client-go/informers"
kubeinformersv1 "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/clientcmd"

kubefledged "github.com/senthilrch/kube-fledged/pkg/client/clientset/versioned"
"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/event"
"github.com/tensorchord/openmodelz/agent/pkg/k8s"
"github.com/tensorchord/openmodelz/agent/pkg/log"
"github.com/tensorchord/openmodelz/agent/pkg/runtime"
Expand Down Expand Up @@ -78,6 +86,7 @@ func (s *Server) initKubernetesResources() error {
}

pods := kubeInformerFactory.Core().V1().Pods()
s.podStartWatch(pods, kubeClient)
go pods.Informer().Run(stopCh)
if ok := cache.WaitForNamedCacheSync(
fmt.Sprintf("%s:pods", consts.ProviderName),
Expand Down Expand Up @@ -120,3 +129,67 @@ func (s *Server) initKubernetesResources() error {
}
return nil
}

// podStartWatch log event when pod start began and finished
func (s *Server) podStartWatch(pods kubeinformersv1.PodInformer, client *kubernetes.Clientset) {
pods.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
new := obj.(*v1.Pod)
controlPlane, exist := new.Annotations[consts.AnnotationControlPlaneKey]
// for inference created by modelz apiserver
if !exist || controlPlane != consts.ModelzAnnotationValue {
return
}
podWatchEventLog(s.eventRecorder, new, types.PodCreateEvent)
start := time.Now()

// Ticker will keep watching until pod start or timeout
ticker := time.NewTicker(time.Second * 2)
timeout := time.After(5 * time.Minute)
go func() {
for {
select {
case <-timeout:
podWatchEventLog(s.eventRecorder, new, types.PodTimeoutEvent)
return
case <-ticker.C:
pod, err := client.CoreV1().Pods(new.Namespace).Get(context.TODO(), new.Name, metav1.GetOptions{})
if err != nil {
logrus.WithFields(logrus.Fields{
"namespace": pod.Namespace,
"deployment": pod.Labels["app"],
"name": pod.Name,
}).Errorf("failed to get pod: %s", err)
return
}
for _, c := range pod.Status.Conditions {
if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
podWatchEventLog(s.eventRecorder, pod, types.PodReadyEvent)
label := prometheus.Labels{
"inference_name": fmt.Sprintf("%s.%s", pod.Labels["app"], pod.Namespace),
"source_image": pod.Annotations[consts.AnnotationDockerImage]}
s.metricsOptions.PodStartHistogram.With(label).
Observe(time.Since(start).Seconds())
return
}
}
}
}
}()
},
})
}

// log status for pod watch status transfer
func podWatchEventLog(recorder event.Interface, obj *v1.Pod, event string) {
deployment := obj.Labels["app"]
err := recorder.CreateDeploymentEvent(obj.Namespace, deployment, event, obj.Name)
if err != nil {
logrus.WithFields(logrus.Fields{
"namespace": obj.Namespace,
"deployment": deployment,
"name": obj.Name,
"event": event,
}).Errorf("failed to create deployment event: %s", err)
}
}
6 changes: 2 additions & 4 deletions ingress-operator/pkg/consts/consts.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package consts

const (
KeyCert = "cert"
EnvironmentPrefix = "MODELZ"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
ModelzAnnotationValue = "modelz"
KeyCert = "cert"
EnvironmentPrefix = "MODELZ"
)
16 changes: 8 additions & 8 deletions ingress-operator/pkg/controller/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,6 @@ import (
"time"

"github.com/google/go-cmp/cmp"
faasv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
"github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
faasscheme "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
v1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/informers/externalversions/modelzetes/v1"
listers "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/listers/modelzetes/v1"
"github.com/tensorchord/openmodelz/ingress-operator/pkg/consts"
mdzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand All @@ -27,6 +20,13 @@ import (
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
klog "k8s.io/klog"

faasv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
"github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
faasscheme "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
v1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/informers/externalversions/modelzetes/v1"
listers "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/listers/modelzetes/v1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

const AgentName = "ingress-operator"
Expand Down Expand Up @@ -257,7 +257,7 @@ func MakeAnnotations(fni *faasv1.InferenceIngress, host string) map[string]strin
annotations := make(map[string]string)

annotations["ai.tensorchord.spec"] = string(specJSON)
inferenceNamespace := fni.Labels[mdzconsts.LabelInferenceNamespace]
inferenceNamespace := fni.Labels[consts.LabelInferenceNamespace]

if !fni.Spec.BypassGateway {
switch class {
Expand Down
10 changes: 9 additions & 1 deletion modelzetes/pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,16 @@ const (

LabelInferenceName = "inference"
LabelInferenceNamespace = "inference-namespace"
LabelBuildName = "ai.tensorchord.build"
LabelName = "ai.tensorchord.name"
LabelNamespace = "modelz.tensorchord.ai/namespace"
LabelServerResource = "ai.tensorchord.server-resource"

AnnotationBuilding = "ai.tensorchord.building"
AnnotationBuilding = "ai.tensorchord.building"
AnnotationDockerImage = "ai.tensorchord.docker.image"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"

ModelzAnnotationValue = "modelz"

TolerationGPU = "ai.tensorchord.gpu"
TolerationNvidiaGPUPresent = "nvidia.com/gpu"
Expand Down

0 comments on commit 6588898

Please sign in to comment.