From 66b0515421de74ef4f813f8d85a17126d5335dd8 Mon Sep 17 00:00:00 2001 From: Dom Delnano Date: Mon, 13 May 2024 10:31:47 -0700 Subject: [PATCH] Revert change that causes `px deploy` to timeout on initial deployment (#1899) Summary: Revert change that causes `px deploy` to timeout on initial deployment This reverts #1670, which creates a circular dependency between the operator and the cloud connector service. The cloud connector service is responsible for [registering a vizier](https://github.com/pixie-io/pixie/blob/fb18345808e6fbd8aed02dac8163469322b84205/src/vizier/services/cloud_connector/bridge/server.go#L290-L302) and populating the `pl-cluster-secrets` k8s secret with its vizier ID. The operator is responsible for creating the vizier services (including the cloud connector), so [this call](https://github.com/pixie-io/pixie/blob/fb18345808e6fbd8aed02dac8163469322b84205/src/operator/controllers/vizier_controller.go#L503) will never succeed on a fresh install. This bug doesn't cause pixie installs to fail completely. A `px deploy` cli command will time out after 10 minutes and then the vizier is deployed following that timeout expiration. While it eventually converges to a healthy vizier, this is a poor user experience. The perf tool is also experiencing this problem, but because it requires `px deploy` to return a successful status code it is causing it to fail completely. Relevant Issues: Reopens https://github.com/pixie-io/pixie/issues/1632 Type of change: /kind bug Test Plan: Reverted this change and verified that the a `skaffold`'ed operator doesn't timeout Signed-off-by: Dom Del Nano --- src/operator/controllers/BUILD.bazel | 2 - src/operator/controllers/vizier_controller.go | 44 +------------------ 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/src/operator/controllers/BUILD.bazel b/src/operator/controllers/BUILD.bazel index 7a53c63a8ab..9cf8d7ea59d 100644 --- a/src/operator/controllers/BUILD.bazel +++ b/src/operator/controllers/BUILD.bazel @@ -29,13 +29,11 @@ go_library( visibility = ["//visibility:public"], deps = [ "//src/api/proto/cloudpb:cloudapi_pl_go_proto", - "//src/api/proto/uuidpb:uuid_pl_go_proto", "//src/api/proto/vizierconfigpb:vizier_pl_go_proto", "//src/operator/apis/px.dev/v1alpha1", "//src/shared/goversion", "//src/shared/services", "//src/shared/status", - "//src/utils", "//src/utils/shared/certs", "//src/utils/shared/k8s", "@com_github_blang_semver//:semver", diff --git a/src/operator/controllers/vizier_controller.go b/src/operator/controllers/vizier_controller.go index ae2e1dfb159..364bd636193 100644 --- a/src/operator/controllers/vizier_controller.go +++ b/src/operator/controllers/vizier_controller.go @@ -48,13 +48,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "px.dev/pixie/src/api/proto/cloudpb" - "px.dev/pixie/src/api/proto/uuidpb" "px.dev/pixie/src/api/proto/vizierconfigpb" "px.dev/pixie/src/operator/apis/px.dev/v1alpha1" version "px.dev/pixie/src/shared/goversion" "px.dev/pixie/src/shared/services" "px.dev/pixie/src/shared/status" - "px.dev/pixie/src/utils" "px.dev/pixie/src/utils/shared/certs" "px.dev/pixie/src/utils/shared/k8s" ) @@ -499,13 +497,7 @@ func (r *VizierReconciler) deployVizier(ctx context.Context, req ctrl.Request, v return err } - // Get the Vizier's ID from the cluster's secrets. - vizierID, err := getVizierID(r.Clientset, req.Namespace) - if err != nil { - log.WithError(err).Error("Failed to retrieve the Vizier ID from the cluster's secrets") - } - - configForVizierResp, err := generateVizierYAMLsConfig(ctx, req.Namespace, r.K8sVersion, vizierID, vz, cloudClient) + configForVizierResp, err := generateVizierYAMLsConfig(ctx, req.Namespace, r.K8sVersion, vz, cloudClient) if err != nil { log.WithError(err).Error("Failed to generate configs for Vizier YAMLs") return err @@ -817,14 +809,13 @@ func convertResourceType(originalLst v1.ResourceList) *vizierconfigpb.ResourceLi // generateVizierYAMLsConfig is responsible retrieving a yaml map of configurations from // Pixie Cloud. -func generateVizierYAMLsConfig(ctx context.Context, ns string, k8sVersion string, vizierID *uuidpb.UUID, vz *v1alpha1.Vizier, conn *grpc.ClientConn) (*cloudpb.ConfigForVizierResponse, +func generateVizierYAMLsConfig(ctx context.Context, ns string, k8sVersion string, vz *v1alpha1.Vizier, conn *grpc.ClientConn) (*cloudpb.ConfigForVizierResponse, error) { client := cloudpb.NewConfigServiceClient(conn) req := &cloudpb.ConfigForVizierRequest{ Namespace: ns, K8sVersion: k8sVersion, - VizierID: vizierID, VzSpec: &vizierconfigpb.VizierSpec{ Version: vz.Spec.Version, DeployKey: vz.Spec.DeployKey, @@ -1143,37 +1134,6 @@ func getClusterUID(clientset *kubernetes.Clientset) (string, error) { return string(ksNS.UID), nil } -// getVizierID gets the ID of the cluster the Vizier is in. -func getVizierID(clientset *kubernetes.Clientset, namespace string) (*uuidpb.UUID, error) { - op := func() (*uuidpb.UUID, error) { - var vizierID *uuidpb.UUID - s := k8s.GetSecret(clientset, namespace, "pl-cluster-secrets") - if s == nil { - return nil, errors.New("Missing cluster secrets, retrying again") - } - if id, ok := s.Data["cluster-id"]; ok { - vizierID = utils.ProtoFromUUIDStrOrNil(string(id)) - if vizierID == nil { - return nil, errors.New("Couldn't convert ID to proto") - } - } - - return vizierID, nil - } - - expBackoff := backoff.NewExponentialBackOff() - expBackoff.InitialInterval = 10 * time.Second - expBackoff.Multiplier = 2 - expBackoff.MaxElapsedTime = 10 * time.Minute - - vizierID, err := backoff.RetryWithData(op, expBackoff) - if err != nil { - return nil, errors.New("Timed out waiting for the Vizier ID") - } - - return vizierID, nil -} - // getConfigForOperator is responsible retrieving the Operator config from from Pixie Cloud. func getConfigForOperator(ctx context.Context, conn *grpc.ClientConn) (*cloudpb.ConfigForOperatorResponse, error) { client := cloudpb.NewConfigServiceClient(conn)