Added a feature to increase the size of the data corpus for the http_logs workload.

Signed-off-by: Govind Kamat <[email protected]>
opensearch-project · Apr 27, 2023 · 30260cf · 30260cf
1 parent 2a7f9a7
commit 30260cf
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 33 deletions.
diff --git a/http_logs/README.md b/http_logs/README.md
@@ -49,6 +49,22 @@ node pipeline to run. Valid options are `'baseline'` (default), `'grok'`  and `'
 * `target_throughput` (default: default values for each operation): Number of requests per second, `none` for no limit.
 * `search_clients`: Number of clients that issues search requests.
 
+
+### Beta Feature: Increasing the size of the data corpus
+
+This workload provides a feature for using a generated data corpus in lieu of the provided corpora files (which currently total ~31 GB).  The generated corpus could, for instance, be 100 GB or more.  For more details on generating such a corpus, run the following command:
+
+```
+  expand-data-corpus.py -h
+```
+
+Once a corpus has been generated, it can be used for a test by supplying the following parameter via `--workload-params`:
+
+* `generated_corpus:t`: Use the generated data corpus instead of the corpora files packaged with this workload
+
+If there are multiple generated corpora files, they are all used concurrently.  The generated corpus and the default corpora are mutually exclusive: only one of them can be ingested in any single OSB run.  Once ingested, however, queries packaged with this workload will operate on the entire loaded data set.
+
+
 ### License
 
 Original license text:

diff --git a/http_logs/workload.json b/http_logs/workload.json
@@ -8,41 +8,52 @@
   "description": "HTTP server log data",
   "#TODO": "Replace index definitions with a template after setting the workload version to 2. Explicit index definitions are not necessary anymore.",
   "indices": [
-    {
-      "name": "logs-181998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-191998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-201998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-211998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-221998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-231998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "logs-241998",
-      "body": "{{ index_body }}"
-    },
-    {
-      "name": "reindexed-logs",
-      "body": "{{ index_body }}"
-    }
+    {%- if generated_corpus is defined %}
+      {{ benchmark.collect(parts="gen-idx-*.json") }}
+    {%- else %}
+      {
+	"name": "logs-181998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-191998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-201998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-211998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-221998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-231998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "logs-241998",
+	"body": "{{ index_body }}"
+      },
+      {
+	"name": "reindexed-logs",
+	"body": "{{ index_body }}"
+      }
+    {%- endif %}
   ],
   "corpora": [
-      {%- if ingest_pipeline is defined and ingest_pipeline == "grok" %}
+      {%- if generated_corpus is defined %}
+        {
+          "name": "http_logs",
+          "documents": [
+            {{ benchmark.collect(parts="gen-docs-*.json") }}
+          ]
+        }
+      {%- elif ingest_pipeline is defined and ingest_pipeline == "grok" %}
         {
           "name": "http_logs_unparsed",
           "base-url": "https://opensearch-benchmark-workloads.s3.amazonaws.com/corpora/http_logs",