From 522562ed31fa6afc1f8fb8edd3871b96613fdd83 Mon Sep 17 00:00:00 2001 From: Riley Karson Date: Fri, 15 Sep 2023 17:20:45 -0700 Subject: [PATCH] Stop deleting failed clusters, taint instead --- .../resource_container_cluster.go.erb | 72 +++---------------- .../resource_container_cluster_test.go.erb | 15 ++-- 2 files changed, 17 insertions(+), 70 deletions(-) diff --git a/mmv1/third_party/terraform/services/container/resource_container_cluster.go.erb b/mmv1/third_party/terraform/services/container/resource_container_cluster.go.erb index d031f130dcd0..29bd46fee95a 100644 --- a/mmv1/third_party/terraform/services/container/resource_container_cluster.go.erb +++ b/mmv1/third_party/terraform/services/container/resource_container_cluster.go.erb @@ -2324,12 +2324,11 @@ func resourceContainerClusterCreate(d *schema.ResourceData, meta interface{}) er } // For now PSC based cluster don't support `enable_private_endpoint` on `create`, but only on `update` API call. - // If cluster is PSC based and enable_private_endpoint is set to true we will ignore it on `create` call and update cluster right after creation. + // If cluster is PSC based and enable_private_endpoint is set to true we will ignore it on `create` call and update cluster right after creation. enablePrivateEndpointPSCCluster := isEnablePrivateEndpointPSCCluster(cluster) if enablePrivateEndpointPSCCluster { cluster.PrivateClusterConfig.EnablePrivateEndpoint = false } - req := &container.CreateClusterRequest{ Cluster: cluster, @@ -2369,29 +2368,26 @@ func resourceContainerClusterCreate(d *schema.ResourceData, meta interface{}) er if err := d.Set("operation", op.Name); err != nil { return fmt.Errorf("Error setting operation: %s", err) } + return nil default: // leaving default case to ensure this is non blocking } + // Try a GET on the cluster so we can see the state in debug logs. This will help classify error states. clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName)) if config.UserProjectOverride { clusterGetCall.Header().Add("X-Goog-User-Project", project) } + _, getErr := clusterGetCall.Do() if getErr != nil { log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName) d.SetId("") } - if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil { - log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr) - // Leave ID set as the cluster likely still exists and should not be removed from state yet. - } else { - log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id()) - d.SetId("") - } - // The resource didn't actually create + // Don't clear cluster id, this will taint the resource + log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName) return waitErr } @@ -2536,14 +2532,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro d.SetId("") } - if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil { - log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr) - // Leave ID set as the cluster likely still exists and should not be removed from state yet. 
- } else { - log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id()) - d.SetId("") - } - // The resource didn't actually create + // Don't clear cluster id, this will taint the resource + log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName) return waitErr } } @@ -4153,52 +4143,6 @@ func resourceContainerClusterDelete(d *schema.ResourceData, meta interface{}) er return nil } -// cleanFailedContainerCluster deletes clusters that failed but were -// created in an error state. Similar to resourceContainerClusterDelete -// but implemented in separate function as it doesn't try to lock already -// locked cluster state, does different error handling, and doesn't do retries. -func cleanFailedContainerCluster(d *schema.ResourceData, meta interface{}) error { - config := meta.(*transport_tpg.Config) - - project, err := tpgresource.GetProject(d, config) - if err != nil { - return err - } - - location, err := tpgresource.GetLocation(d, config) - if err != nil { - return err - } - - userAgent, err := tpgresource.GenerateUserAgentString(d, config.UserAgent) - if err != nil { - return err - } - - clusterName := d.Get("name").(string) - fullName := containerClusterFullName(project, location, clusterName) - - log.Printf("[DEBUG] Cleaning up failed GKE cluster %s", d.Get("name").(string)) - clusterDeleteCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Delete(fullName) - if config.UserProjectOverride { - clusterDeleteCall.Header().Add("X-Goog-User-Project", project) - } - op, err := clusterDeleteCall.Do() - if err != nil { - return transport_tpg.HandleNotFoundError(err, d, fmt.Sprintf("Container Cluster %q", d.Get("name").(string))) - } - - // Wait until it's deleted - waitErr := ContainerOperationWait(config, op, project, location, "deleting GKE cluster", userAgent, d.Timeout(schema.TimeoutDelete)) - if waitErr != nil { - return waitErr - } - - log.Printf("[INFO] GKE cluster %s has been deleted", d.Id()) - d.SetId("") - return nil -} - var containerClusterRestingStates = RestingStates{ "RUNNING": ReadyState, "DEGRADED": ErrorState, diff --git a/mmv1/third_party/terraform/services/container/resource_container_cluster_test.go.erb b/mmv1/third_party/terraform/services/container/resource_container_cluster_test.go.erb index 6b0db114d9dd..189b776fc7ae 100644 --- a/mmv1/third_party/terraform/services/container/resource_container_cluster_test.go.erb +++ b/mmv1/third_party/terraform/services/container/resource_container_cluster_test.go.erb @@ -3356,6 +3356,9 @@ func TestAccContainerCluster_autoprovisioningDefaultsManagement(t *testing.T) { }) } +// This resource originally cleaned up the dangling cluster directly, but now +// taints it, having Terraform clean it up during the next apply. This test +// name is now inexact, but is being preserved to maintain the test history. 
func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) { t.Parallel() @@ -3376,18 +3379,18 @@ func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) { Config: initConfig, }, { - ResourceName: "google_container_cluster.cidr_error_preempt", - ImportState: true, + ResourceName: "google_container_cluster.cidr_error_preempt", + ImportState: true, ImportStateVerify: true, }, { - Config: overlapConfig, + Config: overlapConfig, ExpectError: regexp.MustCompile("Error waiting for creating GKE cluster"), }, - // If dangling cluster wasn't deleted, this plan will return an error + // If tainted cluster won't be deleted, this step will return an error { - Config: overlapConfig, - PlanOnly: true, + Config: overlapConfig, + PlanOnly: true, ExpectNonEmptyPlan: true, }, },
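For reference, below is a minimal sketch of the create-time "taint instead of delete" pattern this patch switches to, assuming a resource built on the Terraform plugin SDKv2 helper/schema package (the same legacy Create signature the .erb resource above uses). The helper names createThing, waitForThing, and resourceThingRead are hypothetical stand-ins for the provider's real GKE client calls and ContainerOperationWait; the point is only that keeping the ID set and returning the wait error is what causes Terraform to mark the resource as tainted and replace it on the next apply, rather than the provider deleting the failed cluster out-of-band during Create.

package example

import (
	"fmt"
	"log"

	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
)

// Hypothetical stand-ins for the real API client, operation polling, and
// read function; they exist only to keep this sketch self-contained.
func createThing(name string) (op string, err error) { return "op-" + name, nil }
func waitForThing(op string) error                   { return nil }
func resourceThingRead(d *schema.ResourceData, meta interface{}) error { return nil }

func resourceThingCreate(d *schema.ResourceData, meta interface{}) error {
	name := d.Get("name").(string)

	op, err := createThing(name)
	if err != nil {
		// Nothing exists server-side yet, so it is safe to return
		// without setting an ID.
		return fmt.Errorf("error creating thing %q: %s", name, err)
	}

	// Record the ID as soon as the API accepts the request so the
	// partially created resource is tracked in state.
	d.SetId(name)

	if waitErr := waitForThing(op); waitErr != nil {
		// The resource exists but is in an error state. Keep the ID set
		// and return the error: Terraform then marks the resource as
		// tainted and replaces it on the next apply, instead of the
		// provider deleting it itself during Create.
		log.Printf("[WARN] thing %s was created in an error state and has been marked as tainted", name)
		return waitErr
	}

	return resourceThingRead(d, meta)
}

This is also why the last test step above now sets ExpectNonEmptyPlan: the tainted cluster stays in state, so a plan against the same config proposes a replacement instead of coming back empty.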