diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/common/RateLimiter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/common/RateLimiter.java index bbc5082d45004..b74e473155aec 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/common/RateLimiter.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/common/RateLimiter.java @@ -28,6 +28,14 @@ * * By setting the accumulated tokens limit to a value greater than zero, it effectively allows bursts of traffic. If the accumulated * tokens limit is set to zero, it will force the acquiring thread to wait on each call. + * + * Example: + * Time unit: Second + * Tokens to produce per time unit: 10 + * Limit for tokens in bucket: 100 + * + * Tokens in bucket after n seconds (n second -> tokens in bucket): + * 1 sec -> 10 tokens, 2 sec -> 20 tokens, ... , 10 sec -> 100 tokens (bucket full), ... 200 sec -> 100 tokens (no increase in tokens) */ public class RateLimiter { @@ -76,6 +84,7 @@ public final synchronized void setRate(double newAccumulatedTokensLimit, double throw new IllegalArgumentException(Strings.format("Tokens per time unit must be less than or equal to %s", Double.MAX_VALUE)); } + // If the new token limit is smaller than what we've accumulated already we need to drop tokens to meet the new token limit accumulatedTokens = Math.min(accumulatedTokens, newAccumulatedTokensLimit); accumulatedTokensLimit = newAccumulatedTokensLimit; @@ -88,7 +97,8 @@ public final synchronized void setRate(double newAccumulatedTokensLimit, double } /** - * Causes the thread to wait until the tokens are available + * Causes the thread to wait until the tokens are available. + * This reserves token in advance leading to a reduction of accumulated tokens. * @param tokens the number of items of work that should be throttled, typically you'd pass a value of 1 here * @throws InterruptedException _ */ @@ -130,6 +140,7 @@ private static void validateTokenRequest(int tokens) { /** * Returns the amount of time to wait for the tokens to become available. + * This reserves tokens in advance leading to a reduction of accumulated tokens. * @param tokens the number of items of work that should be throttled, typically you'd pass a value of 1 here. Must be greater than 0. * @return the amount of time to wait */