-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds controllable randomization to range queries in workloads #455
Changes from 1 commit
ebfbc87
58ce803
76b725e
45c3088
ec46b7b
413ea98
d7a3ca8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -926,7 +926,7 @@ class QueryRandomizerWorkloadProcessor(WorkloadProcessor): | |
DEFAULT_N = 5000 | ||
def __init__(self, cfg): | ||
self.randomization_enabled = cfg.opts("workload", "randomization.enabled", mandatory=False, default_value=False) | ||
self.rf = float(cfg.opts("workload", "randomization.rf", mandatory=False, default_value=self.DEFAULT_RF)) | ||
self.rf = float(cfg.opts("workload", "randomization.repeat_frequency", mandatory=False, default_value=self.DEFAULT_RF)) | ||
self.logger = logging.getLogger(__name__) | ||
self.N = int(cfg.opts("workload", "randomization.n", mandatory=False, default_value=self.DEFAULT_N)) | ||
self.zipf_alpha = 1 | ||
|
@@ -949,7 +949,8 @@ def zipf_cdf_inverse(self, u, H_list): | |
# https://math.stackexchange.com/questions/53671/how-to-calculate-the-inverse-cdf-for-the-zipf-distribution | ||
# Precompute all values H_i,alpha for a fixed alpha and pass in as H_list | ||
if (u < 0 or u >= 1): | ||
raise Exception("Input u must have 0 <= u < 1") | ||
raise exceptions.ExecutorError( | ||
"Input u must have 0 <= u < 1. This error shouldn't appear, please raise an issue if it does") | ||
n = len(H_list) | ||
candidate_return = 1 | ||
denominator = self.H(n, H_list) | ||
|
@@ -999,7 +1000,9 @@ def extract_fields_and_paths(self, params): | |
try: | ||
root = params["body"]["query"] | ||
except KeyError: | ||
raise exceptions.SystemSetupError("Cannot extract range query fields from these params, missing params[\"body\"][\"query\"]") | ||
raise exceptions.SystemSetupError( | ||
f"Cannot extract range query fields from these params: {params}\n, missing params[\"body\"][\"query\"]\n" | ||
f"Make sure the operation in operations/default.json is well-formed") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! |
||
fields_and_paths = self.extract_fields_helper(root, []) | ||
return fields_and_paths | ||
|
||
|
@@ -1056,7 +1059,9 @@ def create_param_source_lambda(self, op_name, get_standard_value, get_standard_v | |
|
||
def on_after_load_workload(self, input_workload, **kwargs): | ||
if not self.randomization_enabled: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Logging here would be good too. Something along the lines of:
|
||
self.logger.info("Query randomization is disabled.") | ||
return input_workload | ||
self.logger.info("Query randomization is enabled, with repeat frequency = {}, n = {}".format(self.rf, self.N)) | ||
|
||
# By default, use params for standard values and generate new standard values the first time an op/field is seen. | ||
# In unit tests, we should be able to supply our own sources independent of params. | ||
|
@@ -1072,7 +1077,7 @@ def on_after_load_workload(self, input_workload, **kwargs): | |
default_test_procedure = None | ||
for test_procedure in input_workload.test_procedures: | ||
if test_procedure.default: | ||
default_test_procedure = test_procedure # TODO - not sure if this is correct | ||
default_test_procedure = test_procedure | ||
break | ||
|
||
for task in default_test_procedure.schedule: | ||
|
@@ -1081,6 +1086,9 @@ def on_after_load_workload(self, input_workload, **kwargs): | |
op_type = workload.OperationType.from_hyphenated_string(leaf_task.operation.type) | ||
except KeyError: | ||
op_type = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: For debugging purposes, it would be nice to have a logging statement in the exception here. |
||
self.logger.warn( | ||
f"QueryRandomizerWorkloadProcessor found operation {leaf_task.operation.name} in default schedule" | ||
f" with type {leaf_task.operation.type}, which couldn't be converted to a known OperationType") | ||
if op_type == workload.OperationType.Search: | ||
op_name = leaf_task.operation.name | ||
param_source_name = op_name + "-randomized" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should aim to use full-length arguments as best practice, such as
--randomization-repeat-frequency
, in addition to the short-hand format, like--randomization-rf
or-rrf
. That way users don't have to look up what the meaning of the flags as much.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@peteralfonsi Integration tests are failing due to extra parameter added has
--
and is not considered a "short-form" flag. Think you'll need to change this to "-rf", "-rrf", or some iteration with a single hyphen.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ahh - thanks, that probably would have taken me a while to figure out!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After second glance,
--randomization-repeat-frequency
is a bit lengthy and can be cumbersome for users to type out every time. Short-form flags should be used for more commonly used flags like--target-hosts
or--workload
.--randomization-repeat-frequency
is a specific use-case and I think it will be fine to keep it as--randomization-rf
.For this option, we can either do what you did before and keep
--randomization-rf
or we can have the full-length version--randomization-repeat-frequency
and a short-form version-rf
. I'll leave it up to your preference!