From 49dff84e7aaa8dddffbed7e5210c84951cf0c4be Mon Sep 17 00:00:00 2001
From: Govind Kamat
Date: Sun, 21 Jul 2024 20:22:39 -0700
Subject: [PATCH] Updates to the expand-data-corpus script.

Signed-off-by: Govind Kamat
---
 scripts/expand-data-corpus.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/scripts/expand-data-corpus.py b/scripts/expand-data-corpus.py
index d27134f54..b1b7d8e38 100755
--- a/scripts/expand-data-corpus.py
+++ b/scripts/expand-data-corpus.py
@@ -19,9 +19,6 @@
 
 help_msg = """
 
-NOTE: This is a beta feature. The user model, interface and options
-are subject to change.
-
 This tool is intended for the purpose of expanding the size of the data
 corpus associated an OSB workload. Currently, this capability is
 implemented only for the http_logs workload.
@@ -106,6 +103,12 @@ def handler(signum, frame):
     sys.exit(1)
 
 
+
+def error_exit(script_name, message):
+    print(f'{script_name}: {message}', file=sys.stderr)
+    sys.exit(1)
+
+
 class DocGenerator:
 
     def __init__(self,
@@ -150,7 +153,8 @@ def error(self, message):
         self.usage_msg()
 
 
-def generate_docs(workload: str,
+def generate_docs(script_name: str,
+                  workload: str,
                   repository: str,
                   input_file: str,
                   output_file_suffix: str,
@@ -165,12 +169,17 @@
 
     #
     config = configparser.ConfigParser()
     benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
-    config.read(benchmark_home + '/.benchmark/benchmark.ini')
+    benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
+    if not os.path.isfile(benchmark_ini):
+        error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it")
+    config.read(benchmark_ini)
 
     root_dir = config['node']['root.dir']
     workload_dir= root_dir + '/workloads/' + repository + '/' + workload
     data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
+    if not os.path.exists(data_dir):
+        error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it")
     output_file = data_dir + '/documents-' + output_file_suffix + '.json'
     if '/' not in input_file:
         input_file = data_dir + '/' + input_file
@@ -274,8 +283,6 @@ def main(args: list) -> None:
     output_file_suffix = args.output_file_suffix
     n_docs = args.number_of_docs
     corpus_size = args.corpus_size
-    interval = args.interval if args.interval is not None else \
-        corpus_size * -2
     start_timestamp = args.start_timestamp
     batch_size = args.batch_size
 
@@ -286,12 +293,14 @@ def main(args: list) -> None:
     elif not n_docs and not corpus_size:
         parser.usage_msg(script_name +
                         ": must specify number of documents or corpus size")
-
+    interval = args.interval if args.interval is not None else \
+        corpus_size * -2
     if workload != 'http_logs':
         parser.usage_msg(script_name +
                         ': only the "http_logs" workload is currently supported')
 
-    generate_docs(workload,
+    generate_docs(script_name,
+                  workload,
                   repository,
                   input_file,
                   output_file_suffix,
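
Note: for reference, below is a minimal standalone sketch of the failure
path these changes add. It assumes OSB's default config location under
$BENCHMARK_HOME (or $HOME, per the patched code) and uses the script's own
name for the message prefix; the standalone framing itself is hypothetical.

    import os
    import sys

    def error_exit(script_name, message):
        # Same shape as the helper added by the patch: prefix the message
        # with the script name, write it to stderr, and exit non-zero.
        print(f'{script_name}: {message}', file=sys.stderr)
        sys.exit(1)

    # Mirrors the patched generate_docs() preamble: locate benchmark.ini
    # and bail out early if OSB has never been run on this host.
    benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
    benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
    if not os.path.isfile(benchmark_ini):
        error_exit('expand-data-corpus.py',
                   f"could not find OSB config file {benchmark_ini}, "
                   "run a workload first to create it")

On a machine where OSB has never executed a workload, this exits with
status 1 and prints the same diagnostic the patched script would, instead
of the configparser KeyError the old code raised.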