Skip to content

Commit

Permalink
Implement KatibConfig API
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <[email protected]>
  • Loading branch information
tenzen-y committed Jul 20, 2023
1 parent 067c119 commit 07e7a32
Show file tree
Hide file tree
Showing 27 changed files with 1,297 additions and 595 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ TEST_TENSORFLOW_EVENT_FILE_PATH ?= $(CURDIR)/test/unit/v1beta1/metricscollector/
# Run tests
.PHONY: test
test: envtest
KUBEBUILDER_ASSETS="$(shell setup-envtest --arch=amd64 use $(ENVTEST_K8S_VERSION) -p path)" go test ./pkg/... ./cmd/... -coverprofile coverage.out
# KUBEBUILDER_ASSETS="$(shell setup-envtest --arch=amd64 use $(ENVTEST_K8S_VERSION) -p path)" go test ./pkg/... ./cmd/... -coverprofile coverage.out
KUBEBUILDER_ASSETS="$(shell setup-envtest use $(ENVTEST_K8S_VERSION) -p path)" go test ./pkg/... ./cmd/... -coverprofile coverage.out

envtest:
ifndef HAS_SETUP_ENVTEST
Expand Down
84 changes: 43 additions & 41 deletions cmd/katib-controller/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os"

"github.com/spf13/viper"
"k8s.io/apimachinery/pkg/runtime"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
Expand All @@ -32,38 +33,32 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/manager/signals"

configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1"
apis "github.com/kubeflow/katib/pkg/apis/controller"
controller "github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util"
"github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig"
webhook "github.com/kubeflow/katib/pkg/webhook/v1beta1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
)

var scheme = runtime.NewScheme()

func init() {
utilruntime.Must(apis.AddToScheme(scheme))
utilruntime.Must(configapi.AddToScheme(scheme))
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
}

func main() {
logf.SetLogger(zap.New())
log := logf.Log.WithName("entrypoint")

var experimentSuggestionName string
var metricsAddr string
var healthzAddr string
var webhookPort int
var injectSecurityContext bool
var enableGRPCProbeInSuggestion bool
var trialResources trialutil.GvkListFlag
var enableLeaderElection bool
var leaderElectionID string

flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name",
"default", "The implementation of suggestion interface in experiment controller (default)")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.")
flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar")
flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions")
flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)")
flag.IntVar(&webhookPort, "webhook-port", 8443, "The port number to be used for admission webhook server.")
// For leader election
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller.")
flag.StringVar(&leaderElectionID, "leader-election-id", "3fbc96e9.katib.kubeflow.org", "The ID for leader election.")
var katibConfigFile string
flag.StringVar(&katibConfigFile, "katib-config", "",
"The katib-controller will load its initial configuration from this file. "+
"Omit this flag to use the default configuration values. ")

// TODO (andreyvelich): Currently it is not possible to set a different webhook service name.
// flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook")
Expand All @@ -72,21 +67,33 @@ func main() {

flag.Parse()

inintConfig, err := katibconfig.GetInitConfigData(scheme, katibConfigFile)
if err != nil {
log.Error(err, "Failed to get KatibConfig")
os.Exit(1)
}

// Set the config in viper.
viper.Set(consts.ConfigExperimentSuggestionName, experimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, injectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, enableGRPCProbeInSuggestion)
viper.Set(consts.ConfigTrialResources, trialResources)
viper.Set(consts.ConfigExperimentSuggestionName, inintConfig.ControllerConfig.ExperimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, inintConfig.ControllerConfig.InjectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, inintConfig.ControllerConfig.EnableGRPCProbeInSuggestion)

trialGVKs, err := configapi.TrialResourcesToGVKs(inintConfig.ControllerConfig.TrialResources)
if err != nil {
log.Error(err, "Failed to parse trialResources")
os.Exit(1)
}
viper.Set(consts.ConfigTrialResources, trialGVKs)

log.Info("Config:",
consts.ConfigExperimentSuggestionName,
viper.GetString(consts.ConfigExperimentSuggestionName),
"webhook-port",
webhookPort,
inintConfig.ControllerConfig.WebhookPort,
"metrics-addr",
metricsAddr,
inintConfig.ControllerConfig.MetricsAddr,
"healthz-addr",
healthzAddr,
inintConfig.ControllerConfig.HealthzAddr,
consts.ConfigInjectSecurityContext,
viper.GetBool(consts.ConfigInjectSecurityContext),
consts.ConfigEnableGRPCProbeInSuggestion,
Expand All @@ -104,10 +111,11 @@ func main() {

// Create a new katib controller to provide shared dependencies and start components
mgr, err := manager.New(cfg, manager.Options{
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: healthzAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
MetricsBindAddress: inintConfig.ControllerConfig.MetricsAddr,
HealthProbeBindAddress: inintConfig.ControllerConfig.HealthzAddr,
LeaderElection: *inintConfig.ControllerConfig.EnableLeaderElection,
LeaderElectionID: inintConfig.ControllerConfig.LeaderElectionID,
Scheme: scheme,
})
if err != nil {
log.Error(err, "Failed to create the manager")
Expand All @@ -116,12 +124,6 @@ func main() {

log.Info("Registering Components.")

// Setup Scheme for all resources
if err := apis.AddToScheme(mgr.GetScheme()); err != nil {
log.Error(err, "Unable to add APIs to scheme")
os.Exit(1)
}

// Setup all Controllers
log.Info("Setting up controller.")
if err := controller.AddToManager(mgr); err != nil {
Expand All @@ -130,7 +132,7 @@ func main() {
}

log.Info("Setting up webhooks.")
if err := webhook.AddToManager(mgr, webhookPort); err != nil {
if err := webhook.AddToManager(mgr, *inintConfig.ControllerConfig.WebhookPort); err != nil {
log.Error(err, "Unable to register webhooks to the manager")
os.Exit(1)
}
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/go-sql-driver/mysql v1.5.0
github.com/golang/mock v1.6.0
github.com/golang/protobuf v1.5.2
github.com/google/go-cmp v0.5.9
github.com/google/go-containerregistry v0.9.0
github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20211222182933-7c19fa370dbd
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
Expand All @@ -24,12 +25,14 @@ require (
github.com/tidwall/gjson v1.14.1
golang.org/x/net v0.5.0
google.golang.org/grpc v1.53.0
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.25.3
k8s.io/apimachinery v0.25.3
k8s.io/client-go v0.25.3
k8s.io/code-generator v0.25.3
k8s.io/klog v1.0.0
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed
sigs.k8s.io/controller-runtime v0.13.0
)

Expand Down Expand Up @@ -69,7 +72,6 @@ require (
github.com/golang-jwt/jwt/v4 v4.2.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
Expand Down Expand Up @@ -129,15 +131,13 @@ require (
gopkg.in/ini.v1 v1.63.2 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gotest.tools/v3 v3.1.0 // indirect
k8s.io/apiextensions-apiserver v0.25.0 // indirect
k8s.io/cloud-provider v0.21.0 // indirect
k8s.io/component-base v0.25.0 // indirect
k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 // indirect
k8s.io/klog/v2 v2.70.1 // indirect
k8s.io/legacy-cloud-providers v0.21.0 // indirect
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
Expand Down
15 changes: 8 additions & 7 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,7 @@ spec:
image: docker.io/kubeflowkatib/katib-controller
command: ["./katib-controller"]
args:
- "--webhook-port=8443"
- "--trial-resources=Job.v1.batch"
- "--trial-resources=TFJob.v1.kubeflow.org"
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
- "--trial-resources=MPIJob.v1.kubeflow.org"
- "--trial-resources=XGBoostJob.v1.kubeflow.org"
- "--trial-resources=MXJob.v1.kubeflow.org"
- "--katib-config=katib-config.yaml"
ports:
- containerPort: 8443
name: webhook
Expand Down Expand Up @@ -60,8 +54,15 @@ spec:
- mountPath: /tmp/cert
name: cert
readOnly: true
- mountPath: /katib-config.yaml
name: katib-config
subPath: katib-config.yaml
readOnly: true
volumes:
- name: cert
secret:
defaultMode: 420
secretName: katib-webhook-cert
- name: katib-config
configMap:
name: katib-config
139 changes: 58 additions & 81 deletions manifests/v1beta1/components/controller/katib-config.yaml
Original file line number Diff line number Diff line change
@@ -1,81 +1,58 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: katib-config
namespace: kubeflow
data:
metrics-collector-sidecar: |-
{
"StdOut": {
"image": "docker.io/kubeflowkatib/file-metrics-collector:latest"
},
"File": {
"image": "docker.io/kubeflowkatib/file-metrics-collector:latest"
},
"TensorFlowEvent": {
"image": "docker.io/kubeflowkatib/tfevent-metrics-collector:latest",
"resources": {
"limits": {
"memory": "1Gi"
}
}
}
}
suggestion: |-
{
"random": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt:latest"
},
"tpe": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt:latest"
},
"grid": {
"image": "docker.io/kubeflowkatib/suggestion-optuna:latest"
},
"hyperband": {
"image": "docker.io/kubeflowkatib/suggestion-hyperband:latest"
},
"bayesianoptimization": {
"image": "docker.io/kubeflowkatib/suggestion-skopt:latest"
},
"cmaes": {
"image": "docker.io/kubeflowkatib/suggestion-goptuna:latest"
},
"sobol": {
"image": "docker.io/kubeflowkatib/suggestion-goptuna:latest"
},
"multivariate-tpe": {
"image": "docker.io/kubeflowkatib/suggestion-optuna:latest"
},
"enas": {
"image": "docker.io/kubeflowkatib/suggestion-enas:latest",
"resources": {
"limits": {
"memory": "200Mi"
}
}
},
"darts": {
"image": "docker.io/kubeflowkatib/suggestion-darts:latest"
},
"pbt": {
"image": "docker.io/kubeflowkatib/suggestion-pbt:latest",
"persistentVolumeClaimSpec": {
"accessModes": [
"ReadWriteMany"
],
"resources": {
"requests": {
"storage": "5Gi"
}
}
}
}
}
early-stopping: |-
{
"medianstop": {
"image": "docker.io/kubeflowkatib/earlystopping-medianstop:latest"
}
}
apiVersion: config.katib.kubeflow.org/v1beta1

Check warning on line 1 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

1:1 [document-start] missing document start "---"
kind: KatibConfig
init:
controller:
webhookPort: 8443
trialResources:
- Job.v1.batch

Check failure on line 7 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

7:5 [indentation] wrong indentation: expected 6 but found 4
- TFJob.v1.kubeflow.org
- PyTorchJob.v1.kubeflow.org
- MPIJob.v1.kubeflow.org
- XGBoostJob.v1.kubeflow.org
- MXJob.v1.kubeflow.org
runtime:
metricsCollectorSidecars:
- collectorKind: StdOut

Check failure on line 15 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

15:3 [indentation] wrong indentation: expected 4 but found 2
image: docker.io/kubeflowkatib/file-metrics-collector:latest
- collectorKind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
- collectorKind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random

Check failure on line 25 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

25:3 [indentation] wrong indentation: expected 4 but found 2
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany

Check failure on line 52 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

52:7 [indentation] wrong indentation: expected 8 but found 6
resources:
requests:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop

Check failure on line 57 in manifests/v1beta1/components/controller/katib-config.yaml

View workflow job for this annotation

GitHub Actions / Lint

57:3 [indentation] wrong indentation: expected 4 but found 2
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
5 changes: 4 additions & 1 deletion manifests/v1beta1/components/controller/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ kind: Kustomization

resources:
- controller.yaml
- katib-config.yaml
- rbac.yaml
- service.yaml
- trial-templates.yaml
configMapGenerator:
- name: katib-config
files:
- katib-config.yaml

Check failure on line 13 in manifests/v1beta1/components/controller/kustomization.yaml

View workflow job for this annotation

GitHub Actions / Lint

13:5 [indentation] wrong indentation: expected 6 but found 4
Loading

0 comments on commit 07e7a32

Please sign in to comment.