Skip to content

Commit

Permalink
Merge pull request #32789 from vespa-engine/bratseth/resource-validation
Browse files Browse the repository at this point in the history
Merge redundancy and resource reducation validation
  • Loading branch information
mpolden authored Nov 7, 2024
2 parents 1d18196 + b622827 commit ac85862
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 181 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ public enum ValidationId {
accessControl("access-control"), // Internal use, used in zones where there should be no access-control
globalEndpointChange("global-endpoint-change"), // Changing global endpoints
zoneEndpointChange("zone-endpoint-change"), // Changing zone (possibly private) endpoint settings
redundancyIncrease("redundancy-increase"), // Increasing redundancy - may easily cause feed blocked
redundancyOne("redundancy-one"), // redundancy=1 requires a validation override on first deployment
pagedSettingRemoval("paged-setting-removal"), // May cause content nodes to run out of memory
certificateRemoval("certificate-removal"); // Remove data plane certificates
certificateRemoval("certificate-removal"), // Remove data plane certificates

@Deprecated
redundancyIncrease("redundancy-increase"); // Not in use. TODO: Remove on Vespa 9

private final String id;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -644,12 +644,12 @@ public ConfigModelRepo configModelRepo() {
/** If provisioning through the node repo, returns the provision requests issued during build of this */
public Provisioned provisioned() { return provisioned; }

/** Returns the id of all clusters in this */
public Set<ClusterSpec.Id> allClusters() {
/** Returns the spedc of all clusters in this */
public Set<ClusterSpec> allClusters() {
return hostSystem().getHosts().stream()
.map(HostResource::spec)
.filter(spec -> spec.membership().isPresent())
.map(spec -> spec.membership().get().cluster().id())
.map(spec -> spec.membership().get().cluster())
.collect(Collectors.toCollection(LinkedHashSet::new));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ private void validateBudget(BigDecimal budget, Context context,
var application = context.model().applicationPackage().getApplicationId();

var maxSpend = 0.0;
for (var id : context.model().allClusters()) {
if (adminClusterIds(context.model()).contains(id)) continue;
var cluster = context.model().provisioned().clusters().get(id);
var capacity = context.model().provisioned().capacities().getOrDefault(id, zeroCapacity);
for (var spec : context.model().allClusters()) {
if (adminClusterIds(context.model()).contains(spec.id())) continue;
var cluster = context.model().provisioned().clusters().get(spec.id());
var capacity = context.model().provisioned().capacities().getOrDefault(spec.id(), zeroCapacity);
maxSpend += capacityPolicies.applyOn(capacity, cluster.isExclusive()).maxResources().cost();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import com.yahoo.vespa.model.application.validation.change.IndexedSearchClusterChangeValidator;
import com.yahoo.vespa.model.application.validation.change.IndexingModeChangeValidator;
import com.yahoo.vespa.model.application.validation.change.NodeResourceChangeValidator;
import com.yahoo.vespa.model.application.validation.change.RedundancyIncreaseValidator;
import com.yahoo.vespa.model.application.validation.change.ResourcesReductionValidator;
import com.yahoo.vespa.model.application.validation.change.RestartOnDeployForLocalLLMValidator;
import com.yahoo.vespa.model.application.validation.change.RestartOnDeployForOnnxModelChangesValidator;
Expand Down Expand Up @@ -128,7 +127,6 @@ private static void validateChanges(Execution execution) {
new ResourcesReductionValidator().validate(execution);
new ContainerRestartValidator().validate(execution);
new NodeResourceChangeValidator().validate(execution);
new RedundancyIncreaseValidator().validate(execution);
new CertificateRemovalChangeValidator().validate(execution);
new RedundancyValidator().validate(execution);
new RestartOnDeployForOnnxModelChangesValidator().validate(execution);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,32 +26,32 @@ public class NodeResourceChangeValidator implements ChangeValidator {

@Override
public void validate(ChangeContext context) {
for (ClusterSpec.Id clusterId : context.previousModel().allClusters()) {
Optional<NodeResources> currentResources = resourcesOf(clusterId, context.previousModel());
Optional<NodeResources> nextResources = resourcesOf(clusterId, context.model());
for (ClusterSpec cluster : context.previousModel().allClusters()) {
Optional<NodeResources> currentResources = resourcesOf(cluster, context.previousModel());
Optional<NodeResources> nextResources = resourcesOf(cluster, context.model());
if (currentResources.isEmpty() || nextResources.isEmpty()) continue; // new or removed cluster
if ( changeRequiresRestart(currentResources.get(), nextResources.get()))
createRestartActionsFor(clusterId, context.previousModel()).forEach(context::require);
createRestartActionsFor(cluster, context.previousModel()).forEach(context::require);
}
}

private boolean changeRequiresRestart(NodeResources currentResources, NodeResources nextResources) {
return currentResources.memoryGiB() != nextResources.memoryGiB();
}

private Optional<NodeResources> resourcesOf(ClusterSpec.Id clusterId, VespaModel model) {
private Optional<NodeResources> resourcesOf(ClusterSpec cluster, VespaModel model) {
return model.allocatedHosts().getHosts().stream().filter(host -> host.membership().isPresent())
.filter(host -> host.membership().get().cluster().id().equals(clusterId))
.filter(host -> host.membership().get().cluster().id().equals(cluster.id()))
.findFirst()
.map(HostSpec::advertisedResources);
}

private List<ConfigChangeAction> createRestartActionsFor(ClusterSpec.Id clusterId, VespaModel model) {
ApplicationContainerCluster containerCluster = model.getContainerClusters().get(clusterId.value());
private List<ConfigChangeAction> createRestartActionsFor(ClusterSpec cluster, VespaModel model) {
ApplicationContainerCluster containerCluster = model.getContainerClusters().get(cluster.id().value());
if (containerCluster != null)
return createRestartActionsFor(containerCluster);

ContentCluster contentCluster = model.getContentClusters().get(clusterId.value());
ContentCluster contentCluster = model.getContentClusters().get(cluster.id().value());
if (contentCluster != null)
return createRestartActionsFor(contentCluster);

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.model.VespaModel;
import com.yahoo.vespa.model.application.validation.Validation.ChangeContext;
import com.yahoo.vespa.model.content.cluster.ContentCluster;

/**
* Checks that no cluster sizes are reduced too much in one go.
* Checks that resources per document per node is reduced too much in one go.
*
* @author bratseth
*/
public class ResourcesReductionValidator implements ChangeValidator {

private static final double maxAllowedUtilizationIncrease = 2.0; // Allow max doubling of utilization

@Override
public void validate(ChangeContext context) {
for (var clusterId : context.previousModel().allClusters()) {
Expand All @@ -23,35 +26,66 @@ public void validate(ChangeContext context) {
}
}

private void validate(ClusterSpec.Id clusterId, ChangeContext context) {
ClusterResources current = clusterResources(clusterId, context.previousModel());
ClusterResources next = clusterResources(clusterId, context.model());
if (current == null || next == null) return; // No request recording - test
private void validate(ClusterSpec cluster, ChangeContext context) {
ClusterResources current = clusterResources(cluster.id(), context.previousModel());
ClusterResources next = clusterResources(cluster.id(), context.model());
if (current == null || next == null) return;

double loadIncreasePerNode;
StringBuilder changes = new StringBuilder();
if ( ! cluster.type().isContent()) {
loadIncreasePerNode = (double)current.nodes() / next.nodes();
if (current.nodes() != next.nodes())
changes.append("from ").append(current.nodes()).append(" nodes to ")
.append(next.nodes()).append(" nodes").append(", ");
}
else { // take data size per node given from redundancy and groups into account
ContentCluster currentCluster = context.previousModel().getContentClusters().get(cluster.id().value());
ContentCluster nextCluster = context.model().getContentClusters().get(cluster.id().value());
loadIncreasePerNode =
(double) nextCluster.getRedundancy().finalRedundancy() / currentCluster.getRedundancy().finalRedundancy()
*
(double) currentCluster.groupSize() / nextCluster.groupSize();
if (nextCluster.getRedundancy().finalRedundancy() != currentCluster.getRedundancy().finalRedundancy())
changes.append("redundancy from ").append(currentCluster.getRedundancy().finalRedundancy())
.append(" to ").append(nextCluster.getRedundancy().finalRedundancy()).append(", ");
if (nextCluster.groupSize() != currentCluster.groupSize())
changes.append("group size from ").append(currentCluster.groupSize())
.append(" to ").append(nextCluster.groupSize()).append(" nodes, ");
}

if (current.nodeResources().isUnspecified() || next.nodeResources().isUnspecified()) {
// Self-hosted - unspecified resources; compare node count
int currentNodes = current.nodes();
int nextNodes = next.nodes();
if (nextNodes < 0.5 * currentNodes && nextNodes != currentNodes - 1) {
context.invalid(ValidationId.resourcesReduction,
"Size reduction in '" + clusterId.value() + "' is too large: " +
"To guard against mistakes, the new max nodes must be at least 50% of the current nodes. " +
"Current nodes: " + currentNodes + ", new nodes: " + nextNodes);
}
// Self-hosted: We don't know node resources so assume they are constant
if (loadIncreasePerNode > maxAllowedUtilizationIncrease)
invalid(changes, cluster, context);
}
else {
NodeResources currentResources = current.totalResources();
NodeResources nextResources = next.totalResources();
if (nextResources.vcpu() < 0.5 * currentResources.vcpu() ||
nextResources.memoryGiB() < 0.5 * currentResources.memoryGiB() ||
nextResources.diskGb() < 0.5 * currentResources.diskGb())
context.invalid(ValidationId.resourcesReduction,
"Resource reduction in '" + clusterId.value() + "' is too large: " +
"To guard against mistakes, the new max resources must be at least 50% of the current " +
"max resources in all dimensions. " +
"Current: " + currentResources.withBandwidthGbps(0) + // (don't output bandwidth here)
", new: " + nextResources.withBandwidthGbps(0));
NodeResources currentResources = current.nodeResources();
NodeResources nextResources = next.nodeResources();
if (invalid(currentResources.vcpu(), nextResources.vcpu(), "vcpu", loadIncreasePerNode, changes)
|| invalid(currentResources.memoryGiB(), nextResources.memoryGiB(), "memory GiB", loadIncreasePerNode, changes)
|| invalid(currentResources.diskGb(), nextResources.diskGb(), "disk Gb", loadIncreasePerNode, changes))
invalid(changes, cluster, context);
}
}

private boolean invalid(double current, double next, String resource, double loadIncreasePerNode,
StringBuilder changes) {
if (loadIncreasePerNode * current / next > maxAllowedUtilizationIncrease) {
if (current != next)
changes.append("from ").append(current).append(" to ").append(next)
.append(" ").append(resource).append(" per node, ");
return true;
}
return false;
}

private void invalid(StringBuilder changes, ClusterSpec cluster, ChangeContext context) {
changes.setLength(changes.length() - 2); // Remove last ", "
context.invalid(ValidationId.resourcesReduction,
"Effective resource reduction in " + cluster.id() + " is too large: " +
changes +
". To protect against mistakes, changes causing load increases of more than 100% are blocked");
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,10 @@ public boolean isGloballyDistributed(NewDocumentType docType) {

public Redundancy getRedundancy() { return redundancy; }

public int groupSize() {
return getNodeCount() / getRootGroup().getNumberOfLeafGroups();
}

public ContentCluster setRedundancy(Redundancy redundancy) {
this.redundancy = redundancy;
return this;
Expand Down

This file was deleted.

Loading

0 comments on commit ac85862

Please sign in to comment.