Track dataset size (#1827)
With this commit we introduce a new metric "Dataset size" that tracks
not only local disk usage but also the disk usage of partially mounted
snapshots. This also allows tracking disk usage in Serverless.
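
For context, a minimal sketch (not part of the commit) of where the index stats API reports both sizes. The JSON paths match the extract_value() calls in esrally/telemetry.py below; the client setup is assumed for illustration:

    from elasticsearch import Elasticsearch

    # Illustrative client; the endpoint is an assumption for this sketch.
    es = Elasticsearch("http://localhost:9200")

    # The "store" stats carry both size_in_bytes (local disk) and
    # total_data_set_size_in_bytes (includes partially mounted data).
    stats = es.indices.stats(metric="store")
    store = stats["_all"]["total"]["store"]

    local_bytes = store["size_in_bytes"]
    dataset_bytes = store["total_data_set_size_in_bytes"]
    print(f"store={local_bytes} bytes, data set={dataset_bytes} bytes")
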
danielmitterdorfer authored Jan 30, 2024
1 parent 2c59d03 commit 5e6b1bc
Showing 6 changed files with 24 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/metrics.rst
@@ -150,6 +150,7 @@ Rally stores the following metrics:
* ``flush_total_time``: Cumulative time used for index flush of primary shards, as reported by the index stats API. Note that this is not wall clock time. These metric records also have a ``per-shard`` property that contains the times across primary shards in an array.
* ``flush_total_count``: Cumulative number of flushes of primary shards, as reported by index stats API under ``_all/primaries``.
* ``final_index_size_bytes``: Final resulting index size on the file system after all nodes have been shut down at the end of the benchmark. It includes all files in the nodes' data directories (actual index files and translog).
* ``dataset_size_in_bytes``: Total data set size in bytes of the index. This includes the size of shards not stored fully on nodes, such as the cache for partially mounted indices (see the sketch after this list).
* ``store_size_in_bytes``: The size in bytes of the index (excluding the translog), as reported by the index stats API.
* ``translog_size_in_bytes``: The size in bytes of the translog, as reported by the index stats API.
* ``ml_processing_time``: A structure containing the minimum, mean, median and maximum bucket processing time in milliseconds per machine learning job. These metrics are only available if a machine learning job has been created in the respective benchmark.
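
A hypothetical helper illustrating how the two size metrics relate, assuming both come from the same stats snapshot; ``partially_mounted_bytes`` is a name introduced here, not part of Rally:

    def partially_mounted_bytes(dataset_size_in_bytes: int, store_size_in_bytes: int) -> int:
        """Approximate the bytes of the data set not held on local disk.

        For a fully local index the two metrics coincide and this returns 0.
        """
        return max(0, dataset_size_in_bytes - store_size_in_bytes)

    # With the fixture values from tests/telemetry_test.py below:
    assert partially_mounted_bytes(112113867510, 2113867510) == 110000000000
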
6 changes: 6 additions & 0 deletions docs/summary_report.rst
@@ -151,6 +151,12 @@ Total ZGC Pauses GC count
* **Definition**: The total number of Stop-The-World pauses performed by ZGC across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_zgc_pauses_gc_count``

Dataset size
------------

* **Definition**: Total data set size in bytes of the index. This includes the size of shards not stored fully on nodes, such as the cache for partially mounted indices.
* **Corresponding metrics key**: ``dataset_size_in_bytes``
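
For orientation, a hypothetical row as it might appear in the rendered summary report; the value is illustrative and the exact column layout can differ between Rally versions:

    |                 Dataset size |      | 104.414 |     GB |
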

Store size
----------

2 changes: 2 additions & 0 deletions esrally/metrics.py
@@ -2078,6 +2078,7 @@ def __call__(self):
result.memory_norms = self.median("segments_norms_memory_in_bytes")
result.memory_points = self.median("segments_points_memory_in_bytes")
result.memory_stored_fields = self.median("segments_stored_fields_memory_in_bytes")
result.dataset_size = self.sum("dataset_size_in_bytes")
result.store_size = self.sum("store_size_in_bytes")
result.translog_size = self.sum("translog_size_in_bytes")
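
Note the aggregation choice: like the other size metrics, the new metric is totalled with ``sum`` across samples, while the per-segment memory metrics above use ``median``. A toy sketch of those two semantics with made-up sample values (Rally's real calculator reads samples from the metrics store, not an in-memory dict):

    import statistics

    samples = {
        "dataset_size_in_bytes": [112113867510],
        "segments_points_memory_in_bytes": [512, 640, 576],
    }

    total = sum(samples["dataset_size_in_bytes"])                           # 112113867510
    middle = statistics.median(samples["segments_points_memory_in_bytes"])  # 576
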

@@ -2264,6 +2265,7 @@ def __init__(self, d=None):
self.memory_norms = self.v(d, "memory_norms")
self.memory_points = self.v(d, "memory_points")
self.memory_stored_fields = self.v(d, "memory_stored_fields")
self.dataset_size = self.v(d, "dataset_size")
self.store_size = self.v(d, "store_size")
self.translog_size = self.v(d, "translog_size")
self.segment_count = self.v(d, "segment_count")
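
``v`` reads a single key from the (possibly absent) source dict; a minimal sketch of that accessor pattern, assuming a ``None`` default (the real implementation may differ):

    def v(self, d, k, default=None):
        # Null-safe lookup: tolerate d being None when stats are built
        # fresh rather than restored from a stored document.
        return d.get(k, default) if d else default
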
10 changes: 10 additions & 0 deletions esrally/reporter.py
@@ -292,6 +292,7 @@ def _report_gc_metrics(self, stats):

def _report_disk_usage(self, stats):
return self._join(
self._line("Dataset size", "", stats.dataset_size, "GB", convert.bytes_to_gb),
self._line("Store size", "", stats.store_size, "GB", convert.bytes_to_gb),
self._line("Translog size", "", stats.translog_size, "GB", convert.bytes_to_gb),
)
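
``convert.bytes_to_gb`` turns the raw byte count into the displayed value; a plausible sketch of such a formatter, assuming binary division (the real helper may round differently):

    def bytes_to_gb(num_bytes):
        # 1 GB taken as 1024**3 bytes for this sketch.
        return num_bytes / (1024 * 1024 * 1024)

    # e.g. the data set size from the test fixture below:
    # bytes_to_gb(112113867510) -> ~104.414
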
@@ -886,6 +887,15 @@ def _count_metric(metric_prefix, description):

def _report_disk_usage(self, baseline_stats, contender_stats):
return self._join(
self._line(
"Dataset size",
baseline_stats.dataset_size,
contender_stats.dataset_size,
"",
"GB",
treat_increase_as_improvement=False,
formatter=convert.bytes_to_gb,
),
self._line(
"Store size",
baseline_stats.store_size,
3 changes: 3 additions & 0 deletions esrally/telemetry.py
@@ -2039,6 +2039,9 @@ def on_benchmark_stop(self):
self.add_metrics(self.extract_value(p, ["segments", "terms_memory_in_bytes"]), "segments_terms_memory_in_bytes", "byte")
self.add_metrics(self.extract_value(p, ["segments", "norms_memory_in_bytes"]), "segments_norms_memory_in_bytes", "byte")
self.add_metrics(self.extract_value(p, ["segments", "points_memory_in_bytes"]), "segments_points_memory_in_bytes", "byte")
self.add_metrics(
self.extract_value(index_stats, ["_all", "total", "store", "total_data_set_size_in_bytes"]), "dataset_size_in_bytes", "byte"
)
self.add_metrics(self.extract_value(index_stats, ["_all", "total", "store", "size_in_bytes"]), "store_size_in_bytes", "byte")
self.add_metrics(self.extract_value(index_stats, ["_all", "total", "translog", "size_in_bytes"]), "translog_size_in_bytes", "byte")
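
``extract_value`` descends a key path through the stats response; a minimal sketch of that traversal, assuming a sentinel default for missing paths (the real method may handle errors differently):

    def extract_value(stats, path, default="unavailable"):
        # Walk the nested response key by key; fall back to the sentinel
        # if any segment of the path is missing.
        value = stats
        for key in path:
            if not isinstance(value, dict) or key not in value:
                return default
            value = value[key]
        return value
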

2 changes: 2 additions & 0 deletions tests/telemetry_test.py
@@ -4025,6 +4025,7 @@ def test_stores_available_index_stats(self, metrics_store_cluster_value, metrics
"total": {
"store": {
"size_in_bytes": 2113867510,
"total_data_set_size_in_bytes": 112113867510,
},
"translog": {
"operations": 6840000,
@@ -4208,6 +4209,7 @@ def test_stores_available_index_stats(self, metrics_store_cluster_value, metrics
mock.call("segments_terms_memory_in_bytes", 256, "byte"),
# we don't have norms, so nothing should have been called
mock.call("store_size_in_bytes", 2113867510, "byte"),
mock.call("dataset_size_in_bytes", 112113867510, "byte"),
mock.call("translog_size_in_bytes", 2647984713, "byte"),
],
any_order=True,
