From c70ebbf716485504729f24b1812dce31d634a027 Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Thu, 7 Nov 2024 08:22:39 +0100 Subject: [PATCH] Increase time before failing a node when it is unresponsive --- .../provision/maintenance/NodeRepositoryMaintenance.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index ea3c8748d41..b6c383282be 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -18,6 +18,8 @@ import java.util.List; import java.util.concurrent.CopyOnWriteArrayList; +import static com.yahoo.config.provision.CloudName.YAHOO; + /** * A component which sets up all the node repo maintenance jobs. * @@ -144,7 +146,8 @@ private static class DefaultTimes { hostResumeProvisionerInterval = Duration.ofMinutes(3); diskReplacerInterval = Duration.ofMinutes(3); failedExpirerInterval = Duration.ofMinutes(10); - failGrace = Duration.ofMinutes(10); + // Nodes in Yahoo cloud need more time to start, so give those longer time before failing them (need more than 10 mins) + failGrace = zone.cloud().name() == YAHOO ? Duration.ofMinutes(20) : Duration.ofMinutes(10); infrastructureProvisionInterval = Duration.ofMinutes(3); loadBalancerExpirerInterval = Duration.ofMinutes(5); loadBalancerPreProvisionerInterval = Duration.ofMinutes(1);