From 5538546e9f1d73fce58c4834164cc1045811426d Mon Sep 17 00:00:00 2001
From: Ian Hoang <51065478+IanHoang@users.noreply.github.com>
Date: Thu, 25 Jan 2024 13:10:36 -0600
Subject: [PATCH] Create-Workload Improvements: Separate operations and test
 procedures from workload.json (#446)

Signed-off-by: Ian Hoang <hoangia@amazon.com>
Co-authored-by: Ian Hoang <hoangia@amazon.com>
---
 osbenchmark/resources/base-workload.json.j2   | 35 ++------
 .../resources/custom-operations.json.j2       | 27 +++++++
 .../resources/custom-query-workload.json.j2   | 14 ----
 .../resources/custom-test-procedures.json.j2  | 64 +++++++++++++++
 .../resources/default-operations.json.j2      | 28 +++++++
 .../resources/default-query-workload.json.j2  | 16 ----
 .../resources/default-test-procedures.json.j2 | 59 ++++++++++++++
 .../workload_generator/workload_generator.py  | 79 +++++++++++++++----
 8 files changed, 247 insertions(+), 75 deletions(-)
 create mode 100644 osbenchmark/resources/custom-operations.json.j2
 delete mode 100644 osbenchmark/resources/custom-query-workload.json.j2
 create mode 100644 osbenchmark/resources/custom-test-procedures.json.j2
 create mode 100644 osbenchmark/resources/default-operations.json.j2
 delete mode 100644 osbenchmark/resources/default-query-workload.json.j2
 create mode 100644 osbenchmark/resources/default-test-procedures.json.j2

diff --git a/osbenchmark/resources/base-workload.json.j2 b/osbenchmark/resources/base-workload.json.j2
index cfb6ced98..ca0c5668a 100644
--- a/osbenchmark/resources/base-workload.json.j2
+++ b/osbenchmark/resources/base-workload.json.j2
@@ -22,35 +22,10 @@
       ]
     }{% endfor %}
   ],
-  "schedule": [
-    {
-      "operation": "delete-index"
-    },{% raw %}
-    {
-      "operation": {
-        "operation-type": "create-index",
-        "settings": {{index_settings | default({}) | tojson}}
-      }
-    },{% endraw %}
-    {
-      "operation": {
-        "operation-type": "cluster-health",
-        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
-        "request-params": {
-          "wait_for_status": "{{cluster_health | default('green')}}",
-          "wait_for_no_relocating_shards": "true"
-        },
-        "retry-until-success": true
-      }
-    },
-    {
-      "operation": {
-        "operation-type": "bulk",
-        "bulk-size": {{bulk_size | default(5000)}},
-        "ingest-percentage": {{ingest_percentage | default(100)}}
-      },
-      "clients": {{bulk_indexing_clients | default(8)}}
-    },{% endraw -%}
-    {% block queries %}{% endblock %}
+  "operations": [
+    {% raw %}{{ benchmark.collect(parts="operations/*.json") }}{% endraw %}
+  ],
+  "test_procedures": [
+    {% raw %}{{ benchmark.collect(parts="test_procedures/*.json") }}{% endraw %}
   ]
 }
diff --git a/osbenchmark/resources/custom-operations.json.j2 b/osbenchmark/resources/custom-operations.json.j2
new file mode 100644
index 000000000..16fb38606
--- /dev/null
+++ b/osbenchmark/resources/custom-operations.json.j2
@@ -0,0 +1,27 @@
+{
+    "name": "index-append",
+    "operation-type": "bulk",
+    "bulk-size": {{bulk_size | default(5000)}},
+    "ingest-percentage": {{ingest_percentage | default(100)}}
+},
+{
+    "name": "wait-until-merges-finish",
+    "operation-type": "index-stats",
+    "index": "_all",
+    "condition": {
+      "path": "_all.total.merges.current",
+      "expected-value": 0
+    },
+    "retry-until-success": true,
+    "include-in-reporting": false
+},
+{%- block queries -%}
+{% for query in custom_queries %}
+{
+  "name": "{{query.name}}",
+  "operation-type": "{{query['operation-type']}}",
+  "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
+  "body": {{query.body | replace("'", '"') }}
+}{% if not loop.last %},{% endif -%}
+{% endfor %}
+{%- endblock %}
\ No newline at end of file
diff --git a/osbenchmark/resources/custom-query-workload.json.j2 b/osbenchmark/resources/custom-query-workload.json.j2
deleted file mode 100644
index 76028db77..000000000
--- a/osbenchmark/resources/custom-query-workload.json.j2
+++ /dev/null
@@ -1,14 +0,0 @@
-{% extends "base-workload.json.j2" %}
-
-{%- block queries -%}
-    {% for query in custom_queries %}
-    {
-      "operation": {
-        "name": "{{query.name}}",
-        "operation-type": "{{query['operation-type']}}",
-        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
-        "body": {{query.body | replace("'", '"') }}
-      }
-    }{% if not loop.last %},{% endif -%}
-    {% endfor %}
-{%- endblock %}
diff --git a/osbenchmark/resources/custom-test-procedures.json.j2 b/osbenchmark/resources/custom-test-procedures.json.j2
new file mode 100644
index 000000000..5fc247db1
--- /dev/null
+++ b/osbenchmark/resources/custom-test-procedures.json.j2
@@ -0,0 +1,64 @@
+{
+  "name": "custom-test-procedures",
+  "description": "Customized test procedure with custom operations generated by create-workload feature in OpenSearch Benchmark.",
+  "default": true,
+  "schedule": [
+    {
+      "operation": "delete-index"
+    },
+    {
+      "operation": {
+        "operation-type": "create-index",
+        {% raw %}"settings": {{ index_settings | default({}) | tojson }}
+      {% endraw %}}
+    },
+    {
+      "operation": {
+        "operation-type": "cluster-health",
+        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
+        "request-params": {
+          {% raw %}"wait_for_status": "{{ cluster_health | default('green') }}",
+          {% endraw -%}"wait_for_no_relocating_shards": "true"
+        },
+        "retry-until-success": true
+      }
+    },
+    {
+      "operation": "index-append",
+      {% raw -%}"clients": {{ bulk_indexing_clients | default(8) }},
+      {% endraw -%}
+      {% raw -%}"ignore-response-error-level": "{{ error_level | default('non-fatal') }}"
+    {% endraw -%}},
+    {
+      "name": "refresh-after-index",
+      "operation": "refresh"
+    },
+    {
+      "operation": {
+        "operation-type": "force-merge",
+        "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
+        "max-num-segments": {{ force_merge_max_num_segments | tojson }}
+        {%- endif %}
+      }
+    },
+    {
+      "name": "refresh-after-force-merge",
+      "operation": "refresh"
+    },
+    {
+      "operation": "wait-until-merges-finish"
+    },
+{%- block queries -%}
+    {% for query in custom_queries %}
+    {
+      "operation":"{{query.name}}",
+      {% raw -%}
+      "warmup-iterations": {{ warmup_iterations | default(50) }},
+      "iterations": {{ iterations | default(100) }},
+      "target-throughput": {{ target_throughput | default(3) }},
+      "clients": {{ search_clients | default(1) }}
+    {% endraw -%}}{% if not loop.last %},{% endif -%}
+    {% endfor %}
+{%- endblock %}
+  ]
+}
diff --git a/osbenchmark/resources/default-operations.json.j2 b/osbenchmark/resources/default-operations.json.j2
new file mode 100644
index 000000000..969a3b913
--- /dev/null
+++ b/osbenchmark/resources/default-operations.json.j2
@@ -0,0 +1,28 @@
+{
+    "name": "index-append",
+    "operation-type": "bulk",
+    "bulk-size": {{bulk_size | default(5000)}},
+    "ingest-percentage": {{ingest_percentage | default(100)}}
+},
+{
+    "name": "wait-until-merges-finish",
+    "operation-type": "index-stats",
+    "index": "_all",
+    "condition": {
+      "path": "_all.total.merges.current",
+      "expected-value": 0
+    },
+    "retry-until-success": true,
+    "include-in-reporting": false
+},
+{
+    "name": "match-all",
+    "operation-type": "search",
+    "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
+    "body": {
+        "size": {{match_all_size | default(10)}},
+        "query": {
+            "match_all": {}
+        }
+    }
+}
\ No newline at end of file
diff --git a/osbenchmark/resources/default-query-workload.json.j2 b/osbenchmark/resources/default-query-workload.json.j2
deleted file mode 100644
index 38c296738..000000000
--- a/osbenchmark/resources/default-query-workload.json.j2
+++ /dev/null
@@ -1,16 +0,0 @@
-{% extends "base-workload.json.j2" %}
-
-{% block queries %}
-    {
-      "operation": {
-        "operation-type": "search",
-        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
-        "body": {
-          "query": {
-            "match_all": {}
-          }
-        }
-      },{% raw %}
-      "clients": {{search_clients | default(8)}}{% endraw %}
-    }
-{%- endblock %}
\ No newline at end of file
diff --git a/osbenchmark/resources/default-test-procedures.json.j2 b/osbenchmark/resources/default-test-procedures.json.j2
new file mode 100644
index 000000000..b8f4c73da
--- /dev/null
+++ b/osbenchmark/resources/default-test-procedures.json.j2
@@ -0,0 +1,59 @@
+{
+  "name": "default-test-procedure",
+  "description": "Customized test procedure with custom operations generated by create-workload feature in OpenSearch Benchmark. Workload deletes existing indexes, creates indexes, ingests documents, and runs default match-all query.",
+  "default": true,
+  "schedule": [
+    {
+      "operation": "delete-index"
+    },
+    {
+      "operation": {
+        "operation-type": "create-index",
+        {% raw %}"settings": {{ index_settings | default({}) | tojson }}
+      {% endraw %}}
+    },
+    {
+      "operation": {
+        "operation-type": "cluster-health",
+        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
+        "request-params": {
+          {% raw %}"wait_for_status": "{{ cluster_health | default('green') }}",
+          {% endraw -%}"wait_for_no_relocating_shards": "true"
+        },
+        "retry-until-success": true
+      }
+    },
+    {
+      "operation": "index-append",
+      {% raw -%}"clients": {{ bulk_indexing_clients | default(8) }},
+      {% endraw -%}
+      {% raw -%}"ignore-response-error-level": "{{ error_level | default('non-fatal') }}"
+    {% endraw -%}},
+    {
+      "name": "refresh-after-index",
+      "operation": "refresh"
+    },
+    {
+      "operation": {
+        "operation-type": "force-merge",
+        "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
+        "max-num-segments": {{ force_merge_max_num_segments | tojson }}
+        {%- endif %}
+      }
+    },
+    {
+      "name": "refresh-after-force-merge",
+      "operation": "refresh"
+    },
+    {
+      "operation": "wait-until-merges-finish"
+    },{% raw %}
+    {
+      "operation": "match-all",
+      "warmup-iterations": {{ warmup_iterations | default(50) }},
+      "iterations": {{ iterations | default(100) }},
+      "target-throughput": {{ target_throughput | default(3) }},
+      "clients": {{ search_clients | default(1) }}
+    }{% endraw -%}
+  ]
+}
\ No newline at end of file
diff --git a/osbenchmark/workload_generator/workload_generator.py b/osbenchmark/workload_generator/workload_generator.py
index 3bcaacc42..60163701e 100644
--- a/osbenchmark/workload_generator/workload_generator.py
+++ b/osbenchmark/workload_generator/workload_generator.py
@@ -24,6 +24,7 @@
 
 import logging
 import os
+import shutil
 import json
 
 from opensearchpy import OpenSearchException
@@ -34,14 +35,6 @@
 from osbenchmark.workload_generator import corpus, index
 from osbenchmark.utils import io, opts, console
 
-
-def process_template(templates_path, template_filename, template_vars, output_path):
-    env = Environment(loader=FileSystemLoader(templates_path), autoescape=select_autoescape(['html', 'xml']))
-    template = env.get_template(template_filename)
-
-    with open(output_path, "w") as f:
-        f.write(template.render(template_vars))
-
 def validate_indices_docs_map(indices, indices_docs_map, docs_were_requested):
     if not docs_were_requested:
         return
@@ -108,9 +101,37 @@ def process_custom_queries(custom_queries):
 
     return data
 
+def write_template(template_vars, templates_path, output_path, template_file):
+    template = get_template(template_file, templates_path)
+    with open(output_path, "w") as f:
+        f.write(template.render(template_vars))
+
+def get_template(template_file, templates_path):
+    template_file_name = template_file  + ".json.j2"
+
+    env = Environment(loader=FileSystemLoader(templates_path), autoescape=select_autoescape(['html', 'xml']))
+
+    return env.get_template(template_file_name)
+
+def render_templates(workload_path,
+                    operations_path,
+                    test_procedures_path,
+                    templates_path,
+                    template_vars,
+                    custom_queries):
+    write_template(template_vars, templates_path, workload_path, "base-workload")
+
+    if custom_queries:
+        write_template(template_vars, templates_path, operations_path, "custom-operations")
+        write_template(template_vars, templates_path, test_procedures_path, "custom-test-procedures")
+    else:
+        write_template(template_vars, templates_path, operations_path, "default-operations")
+        write_template(template_vars, templates_path, test_procedures_path, "default-test-procedures")
+
 def create_workload(cfg):
     logger = logging.getLogger(__name__)
 
+    # All inputs provided by user
     workload_name = cfg.opts("workload", "workload.name")
     indices = cfg.opts("generator", "indices")
     root_path = cfg.opts("generator", "output.path")
@@ -118,25 +139,46 @@ def create_workload(cfg):
     client_options = cfg.opts("client", "options")
     number_of_docs = cfg.opts("generator", "number_of_docs")
     unprocessed_custom_queries = cfg.opts("workload", "custom_queries")
+    templates_path = os.path.join(cfg.opts("node", "benchmark.root"), "resources")
 
+    # Process custom queries
     custom_queries = process_custom_queries(unprocessed_custom_queries)
 
     logger.info("Creating workload [%s] matching indices [%s]", workload_name, indices)
     logger.info("Number of Docs: %s", number_of_docs)
+
+    # Initialize client factory
     client = OsClientFactory(hosts=target_hosts.all_hosts[opts.TargetHosts.DEFAULT],
                              client_options=client_options.all_client_options[opts.TargetHosts.DEFAULT]).create()
-
     info = client.info()
     console.info(f"Connected to OpenSearch cluster [{info['name']}] version [{info['version']['number']}].\n", logger=logger)
 
+    # Establish output paths directory
     output_path = os.path.abspath(os.path.join(io.normalize_path(root_path), workload_name))
+
+    operations_path = os.path.join(output_path, "operations")
+    test_procedures_path = os.path.join(output_path, "test_procedures")
+
+    if os.path.exists(output_path):
+        try:
+            logger.info("Workload already exists. Removing existing workload [%s] in path [%s]", workload_name, output_path)
+            shutil.rmtree(output_path)
+        except OSError:
+            logger.error("Had issues removing existing workload [%s] in path [%s]", workload_name, output_path)
+
     io.ensure_dir(output_path)
+    io.ensure_dir(operations_path)
+    io.ensure_dir(test_procedures_path)
 
+    # Extract Indices and Corpora
+    logger.info("Extracting indices and corpora")
     indices, corpora = extract_mappings_and_corpora(client, output_path, indices, number_of_docs)
+    logger.info("Finished extracting indices and corpora")
 
     if len(indices) == 0:
         raise RuntimeError("Failed to extract any indices for workload!")
 
+    # Collect all itmes into dictionary
     template_vars = {
         "workload_name": workload_name,
         "indices": indices,
@@ -147,12 +189,19 @@ def create_workload(cfg):
     logger.info("Template Vars: %s", template_vars)
 
     workload_path = os.path.join(output_path, "workload.json")
-    templates_path = os.path.join(cfg.opts("node", "benchmark.root"), "resources")
-
-    if custom_queries:
-        process_template(templates_path, "custom-query-workload.json.j2", template_vars, workload_path)
-    else:
-        process_template(templates_path, "default-query-workload.json.j2", template_vars, workload_path)
+    operations_path = os.path.join(operations_path, "default.json")
+    test_procedures_path = os.path.join(test_procedures_path, "default.json")
+
+    # Render all templates
+    logger.info("Rendering templates")
+    render_templates(
+        workload_path,
+        operations_path,
+        test_procedures_path,
+        templates_path,
+        template_vars,
+        custom_queries
+    )
 
     console.println("")
     console.info(f"Workload {workload_name} has been created. Run it with: {PROGRAM_NAME} --workload-path={output_path}")