Adds command line argument to specify throughput percentiles to display #449

Merged
merged 1 commit on Jan 30, 2024
7 changes: 7 additions & 0 deletions osbenchmark/benchmark.py
@@ -560,6 +560,12 @@ def add_workload_source(subparser):
f"(default: {metrics.GlobalStatsCalculator.DEFAULT_LATENCY_PERCENTILES}).",
default=metrics.GlobalStatsCalculator.DEFAULT_LATENCY_PERCENTILES
)
test_execution_parser.add_argument(
"--throughput-percentiles",
help=f"A comma-separated list of percentiles to report for throughput, in addition to mean/median/max/min "
f"(default: {metrics.GlobalStatsCalculator.DEFAULT_THROUGHPUT_PERCENTILES}).",
default=metrics.GlobalStatsCalculator.DEFAULT_THROUGHPUT_PERCENTILES
)

###############################################################################
#
@@ -869,6 +875,7 @@ def dispatch_sub_command(arg_parser, args, cfg):
opts.csv_to_list(args.load_worker_coordinator_hosts))
cfg.add(config.Scope.applicationOverride, "workload", "test.mode.enabled", args.test_mode)
cfg.add(config.Scope.applicationOverride, "workload", "latency.percentiles", args.latency_percentiles)
cfg.add(config.Scope.applicationOverride, "workload", "throughput.percentiles", args.throughput_percentiles)
configure_workload_params(arg_parser, args, cfg)
configure_connection_params(arg_parser, args, cfg)
configure_telemetry_params(args, cfg)
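Not part of the diff: a minimal, self-contained sketch of how the new flag behaves, assuming the same comma-separated parsing that TestExecution applies to percentile strings (the empty default means no extra throughput percentiles are reported):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--throughput-percentiles",
    help="A comma-separated list of percentiles to report for throughput, "
         "in addition to mean/median/max/min.",
    default=""  # mirrors DEFAULT_THROUGHPUT_PERCENTILES: no extra percentiles by default
)
args = parser.parse_args(["--throughput-percentiles", "50,90,99.9"])
# TestExecution later splits the string and converts each entry to a float.
values = [float(v) for v in args.throughput_percentiles.split(",")] if args.throughput_percentiles else []
print(values)  # [50.0, 90.0, 99.9]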
75 changes: 57 additions & 18 deletions osbenchmark/metrics.py
@@ -336,7 +336,8 @@ def calculate_results(store, test_execution):
store,
test_execution.workload,
test_execution.test_procedure,
latency_percentiles=test_execution.latency_percentiles
latency_percentiles=test_execution.latency_percentiles,
throughput_percentiles=test_execution.throughput_percentiles
)
return calc()

@@ -1297,13 +1298,19 @@ def create_test_execution(cfg, workload, test_procedure, workload_revision=None)
plugin_params = cfg.opts("builder", "plugin.params")
benchmark_version = version.version()
benchmark_revision = version.revision()
latency_percentiles = cfg.opts("workload", "latency.percentiles", mandatory=False)
latency_percentiles = cfg.opts("workload", "latency.percentiles", mandatory=False,
default_value=GlobalStatsCalculator.DEFAULT_LATENCY_PERCENTILES)
throughput_percentiles = cfg.opts("workload", "throughput.percentiles", mandatory=False,
default_value=GlobalStatsCalculator.DEFAULT_THROUGHPUT_PERCENTILES)
# In tests, we don't get the default command-line arg value for percentiles,
# so supply them as defaults here as well

return TestExecution(benchmark_version, benchmark_revision,
environment, test_execution_id, test_execution_timestamp,
pipeline, user_tags, workload,
workload_params, test_procedure, provision_config_instance, provision_config_instance_params,
plugin_params, workload_revision, latency_percentiles=latency_percentiles)
plugin_params, workload_revision, latency_percentiles=latency_percentiles,
throughput_percentiles=throughput_percentiles)


class TestExecution:
@@ -1313,7 +1320,7 @@ def __init__(self, benchmark_version, benchmark_revision, environment_name,
provision_config_instance_params, plugin_params,
workload_revision=None, provision_config_revision=None,
distribution_version=None, distribution_flavor=None,
revision=None, results=None, meta_data=None, latency_percentiles=None):
revision=None, results=None, meta_data=None, latency_percentiles=None, throughput_percentiles=None):
if results is None:
results = {}
# this happens when the test execution is created initially
@@ -1326,6 +1333,8 @@ def __init__(self, benchmark_version, benchmark_revision, environment_name,
if latency_percentiles:
# split comma-separated string into list of floats
latency_percentiles = [float(value) for value in latency_percentiles.split(",")]
if throughput_percentiles:
throughput_percentiles = [float(value) for value in throughput_percentiles.split(",")]
self.benchmark_version = benchmark_version
self.benchmark_revision = benchmark_revision
self.environment_name = environment_name
Expand All @@ -1347,6 +1356,7 @@ def __init__(self, benchmark_version, benchmark_revision, environment_name,
self.results = results
self.meta_data = meta_data
self.latency_percentiles = latency_percentiles
self.throughput_percentiles = throughput_percentiles


@property
Expand Down Expand Up @@ -1684,46 +1694,56 @@ def filter_percentiles_by_sample_size(sample_size, percentiles):
raise AssertionError("Percentiles require at least one sample")

filtered_percentiles = []
# Treat the 1 and 2-10 case separately, to return p50 and p100 if present
# Treat the cases below 10 separately, to return p25, 50, 75, 100 if present
if sample_size == 1:
filtered_percentiles = [100]
elif sample_size < 10:
elif sample_size < 4:
for p in [50, 100]:
if p in percentiles:
filtered_percentiles.append(p)
elif sample_size < 10:
for p in [25, 50, 75, 100]:
if p in percentiles:
filtered_percentiles.append(p)
else:
effective_sample_size = 10 ** (int(math.log10(sample_size))) # round down to nearest power of ten
delta = 0.000001 # If (p / 100) * effective_sample_size is within this value of a whole number,
# assume the discrepancy is due to floating point and allow it
for p in percentiles:
fraction = p / 100

# check if fraction * effective_sample_size is close enough to a whole number
if abs((effective_sample_size * fraction) - round(effective_sample_size*fraction)) < delta:
if abs((effective_sample_size * fraction) - round(effective_sample_size*fraction)) < delta or p in [25, 75]:
filtered_percentiles.append(p)
# if no percentiles are suitable, just return 100
if len(filtered_percentiles) == 0:
return [100]
return filtered_percentiles

def percentiles_for_sample_size(sample_size, latency_percentiles=None):
def percentiles_for_sample_size(sample_size, percentiles_list=None):
# If latency_percentiles is present, as a list, display those values instead (assuming there are enough samples)
percentiles = GlobalStatsCalculator.DEFAULT_LATENCY_PERCENTILES_LIST
if latency_percentiles:
percentiles = latency_percentiles # Defaults get overridden if a value is provided
percentiles = []
if percentiles_list:
percentiles = percentiles_list # Defaults get overridden if a value is provided
percentiles.sort()
return filter_percentiles_by_sample_size(sample_size, percentiles)

class GlobalStatsCalculator:
DEFAULT_LATENCY_PERCENTILES = "50,90,99,99.9,99.99,100"
DEFAULT_LATENCY_PERCENTILES_LIST = [float(value) for value in DEFAULT_LATENCY_PERCENTILES.split(",")]

def __init__(self, store, workload, test_procedure, latency_percentiles=None):
DEFAULT_THROUGHPUT_PERCENTILES = ""
DEFAULT_THROUGHPUT_PERCENTILES_LIST = []

OTHER_PERCENTILES = [50, 90, 99, 99.9, 99.99, 100]
# Use these percentiles when the single_latency fn is called for something other than latency

def __init__(self, store, workload, test_procedure, latency_percentiles=None, throughput_percentiles=None):
self.store = store
self.logger = logging.getLogger(__name__)
self.workload = workload
self.test_procedure = test_procedure
self.latency_percentiles = latency_percentiles
self.throughput_percentiles = throughput_percentiles

def __call__(self):
result = GlobalStats()
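Illustration only, not part of the diff: how the relaxed filtering behaves for a few sample sizes, consistent with the expected values in tests/metrics_test.py further down (p25/p75 are now admitted for small sample sizes and unconditionally for larger ones); the helpers are module-level functions in osbenchmark.metrics.

from osbenchmark.metrics import filter_percentiles_by_sample_size, percentiles_for_sample_size

# Fewer than 4 samples: only p50/p100 survive; 4-9 samples admit p25/p50/p75/p100.
assert filter_percentiles_by_sample_size(2, [25, 50, 75, 100]) == [50, 100]
assert filter_percentiles_by_sample_size(4, [25, 50, 75, 100]) == [25, 50, 75, 100]
# Larger sample sizes always admit 25 and 75; 99.9 still needs on the order of 1000 samples.
assert filter_percentiles_by_sample_size(100, [25, 45, 50, 75, 99.9]) == [25, 45, 50, 75]
# With no list supplied, percentiles_for_sample_size no longer injects the latency defaults; it falls back to p100.
assert percentiles_for_sample_size(1000) == [100]
assert percentiles_for_sample_size(3, percentiles_list=[25, 50, 75, 100]) == [50, 100]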
@@ -1739,7 +1759,7 @@ def __call__(self):
result.add_op_metrics(
t,
task.operation.name,
self.summary_stats("throughput", t, op_type),
self.summary_stats("throughput", t, op_type, percentiles_list=self.throughput_percentiles),
self.single_latency(t, op_type),
self.single_latency(t, op_type, metric_name="service_time"),
self.single_latency(t, op_type, metric_name="processing_time"),
@@ -1817,28 +1837,44 @@ def sum(self, metric_name):
def one(self, metric_name):
return self.store.get_one(metric_name)

def summary_stats(self, metric_name, task_name, operation_type):
def summary_stats(self, metric_name, task_name, operation_type, percentiles_list=None):
mean = self.store.get_mean(metric_name, task=task_name, operation_type=operation_type, sample_type=SampleType.Normal)
median = self.store.get_median(metric_name, task=task_name, operation_type=operation_type, sample_type=SampleType.Normal)
unit = self.store.get_unit(metric_name, task=task_name, operation_type=operation_type)
stats = self.store.get_stats(metric_name, task=task_name, operation_type=operation_type, sample_type=SampleType.Normal)

result = {}
if mean and median and stats:
return {
result = {
"min": stats["min"],
"mean": mean,
"median": median,
"max": stats["max"],
"unit": unit
}
else:
return {
result = {
"min": None,
"mean": None,
"median": None,
"max": None,
"unit": unit
}

if percentiles_list: # modified from single_latency()
sample_size = stats["count"]
percentiles = self.store.get_percentiles(metric_name,
task=task_name,
operation_type=operation_type,
sample_type=SampleType.Normal,
percentiles=percentiles_for_sample_size(
sample_size,
percentiles_list=percentiles_list))
for k, v in percentiles.items():
# safely encode so we don't have any dots in field names
result[encode_float_key(k)] = v
return result

def shard_stats(self, metric_name):
values = self.store.get_raw(metric_name, mapper=lambda doc: doc["per-shard"])
unit = self.store.get_unit(metric_name)
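For illustration only (values made up, not part of the diff): with percentiles_list=[50.0, 99.9] the throughput summary gains percentile entries alongside min/mean/median/max. The percentile keys come from encode_float_key; the encoding shown here (dot replaced by an underscore) is an assumption based on the "no dots in field names" comment.

throughput_summary = {
    "min": 1740.2,
    "mean": 1823.5,
    "median": 1821.0,
    "max": 1910.7,
    "unit": "docs/s",
    "50_0": 1821.0,  # assumed result of encode_float_key(50.0)
    "99_9": 1908.3,  # assumed result of encode_float_key(99.9)
}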
@@ -1896,6 +1932,9 @@ def single_latency(self, task, operation_type, metric_name="latency"):
sample_type = SampleType.Normal
stats = self.store.get_stats(metric_name, task=task, operation_type=operation_type, sample_type=sample_type)
sample_size = stats["count"] if stats else 0
percentiles_list = self.OTHER_PERCENTILES
if metric_name == "latency":
percentiles_list = self.latency_percentiles
if sample_size > 0:
# The custom latency percentiles have to be supplied here as the workload runs,
# or else they aren't present when results are published
@@ -1905,7 +1944,7 @@ def single_latency(self, task, operation_type, metric_name="latency"):
sample_type=sample_type,
percentiles=percentiles_for_sample_size(
sample_size,
latency_percentiles=self.latency_percentiles
percentiles_list=percentiles_list
))
mean = self.store.get_mean(metric_name,
task=task,
14 changes: 10 additions & 4 deletions osbenchmark/results_publisher.py
@@ -122,7 +122,10 @@ def __init__(self, results, config):
self.show_processing_time = convert.to_bool(config.opts("results_publishing", "output.processingtime",
mandatory=False, default_value=False))
self.cwd = config.opts("node", "benchmark.cwd")
self.latency_percentiles = comma_separated_string_to_number_list(config.opts("workload", "latency.percentiles", mandatory=False))
self.display_percentiles = {
"throughput":comma_separated_string_to_number_list(config.opts("workload", "throughput.percentiles", mandatory=False)),
"latency": comma_separated_string_to_number_list(config.opts("workload", "latency.percentiles", mandatory=False))
}

def publish(self):
print_header(FINAL_SCORE)
@@ -184,7 +187,8 @@ def _publish_throughput(self, values, task):
self._line("Min Throughput", task, throughput["min"], unit, lambda v: "%.2f" % v),
self._line("Mean Throughput", task, throughput["mean"], unit, lambda v: "%.2f" % v),
self._line("Median Throughput", task, throughput["median"], unit, lambda v: "%.2f" % v),
self._line("Max Throughput", task, throughput["max"], unit, lambda v: "%.2f" % v)
self._line("Max Throughput", task, throughput["max"], unit, lambda v: "%.2f" % v),
*self._publish_percentiles("throughput", task, throughput)
)

def _publish_latency(self, values, task):
@@ -198,8 +202,10 @@ def _publish_processing_time(self, values, task):

def _publish_percentiles(self, name, task, value):
lines = []
percentiles = self.display_percentiles.get(name, metrics.GlobalStatsCalculator.OTHER_PERCENTILES)

if value:
for percentile in metrics.percentiles_for_sample_size(sys.maxsize, latency_percentiles=self.latency_percentiles):
for percentile in metrics.percentiles_for_sample_size(sys.maxsize, percentiles_list=percentiles):
percentile_value = value.get(metrics.encode_float_key(percentile))
a_line = self._line("%sth percentile %s" % (percentile, name), task, percentile_value, "ms",
force=self.publish_all_percentile_values)
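Illustration only, not part of the diff: the per-metric lookup that _publish_percentiles now performs. Metric names present in the mapping use the percentiles supplied on the command line; anything else falls back to the fixed OTHER_PERCENTILES list (the name "service_time" below is just a placeholder):

display_percentiles = {
    "throughput": [50.0, 90.0],     # from --throughput-percentiles
    "latency": [50.0, 99.0, 99.9],  # from --latency-percentiles
}
OTHER_PERCENTILES = [50, 90, 99, 99.9, 99.99, 100]

print(display_percentiles.get("throughput", OTHER_PERCENTILES))    # [50.0, 90.0]
print(display_percentiles.get("service_time", OTHER_PERCENTILES))  # the fixed fallback list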
@@ -436,7 +442,7 @@ def _publish_processing_time(self, baseline_stats, contender_stats, task):

def _publish_percentiles(self, name, task, baseline_values, contender_values):
lines = []
for percentile in metrics.percentiles_for_sample_size(sys.maxsize, latency_percentiles=self.latency_percentiles):
for percentile in metrics.percentiles_for_sample_size(sys.maxsize, percentiles_list=self.latency_percentiles):
baseline_value = baseline_values.get(metrics.encode_float_key(percentile))
contender_value = contender_values.get(metrics.encode_float_key(percentile))
self._append_non_empty(lines, self._line("%sth percentile %s" % (percentile, name),
20 changes: 12 additions & 8 deletions tests/metrics_test.py
@@ -1797,9 +1797,11 @@ def test_filter_percentiles_by_sample_size(self):
4,
10,
10.01,
25,
45,
46.001,
50,
75,
80.1,
90,
98.9,
@@ -1813,14 +1815,16 @@
100]
sample_size_to_result_map = {
1: [100],
5: [50, 100],
10: [0, 10, 50, 90, 100],
99: [0, 10, 50, 90, 100],
100: [0, 4, 10, 45, 50, 90, 99, 100],
1000: [0, 0.1, 4, 10, 45, 50, 80.1, 90, 98.9, 99, 99.9, 100],
10000: [0, 0.01, 0.1, 4, 10, 10.01, 45, 50, 80.1, 90, 98.9, 98.91, 99, 99.9, 99.99, 100],
100000: [0, 0.001, 0.01, 0.1, 4, 10, 10.01, 45, 46.001, 50, 80.1, 90, 98.9, 98.91, 98.999, 99, 99.9, 99.99, 99.999, 100],
1000000: [0, 0.0001, 0.001, 0.01, 0.1, 4, 10, 10.01, 45, 46.001, 50,
2: [50, 100],
4: [25, 50, 75, 100],
10: [0, 10, 25, 50, 75, 90, 100],
99: [0, 10, 25, 50, 75, 90, 100],
100: [0, 4, 10, 25, 45, 50, 75, 90, 99, 100],
1000: [0, 0.1, 4, 10, 25, 45, 50, 75, 80.1, 90, 98.9, 99, 99.9, 100],
10000: [0, 0.01, 0.1, 4, 10, 10.01, 25, 45, 50, 75, 80.1, 90, 98.9, 98.91, 99, 99.9, 99.99, 100],
100000: [0, 0.001, 0.01, 0.1, 4, 10, 10.01, 25, 45, 46.001, 50, 75,
80.1, 90, 98.9, 98.91, 98.999, 99, 99.9, 99.99, 99.999, 100],
1000000: [0, 0.0001, 0.001, 0.01, 0.1, 4, 10, 10.01, 25, 45, 46.001, 50, 75,
80.1, 90, 98.9, 98.91, 98.999, 99, 99.9, 99.99, 99.999, 99.9999, 100]
} # 100,000 corresponds to 0.001% which is the order of magnitude we round to,
# so at higher orders (>=1M samples) all values are permitted