Skip to content

Commit

Permalink
Updates to the expand-data-corpus script.
Browse files Browse the repository at this point in the history
Signed-off-by: Govind Kamat <[email protected]>
  • Loading branch information
gkamat committed Aug 6, 2024
1 parent 61ec362 commit 49dff84
Showing 1 changed file with 18 additions and 9 deletions.
27 changes: 18 additions & 9 deletions scripts/expand-data-corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@

help_msg = """
NOTE: This is a beta feature. The user model, interface and options
are subject to change.
This tool is intended for the purpose of expanding the size of the
data corpus associated an OSB workload. Currently, this capability is
implemented only for the http_logs workload.
Expand Down Expand Up @@ -106,6 +103,12 @@
def handler(signum, frame):
    """Terminate the process with exit status 1.

    The (signum, frame) signature matches what the ``signal`` module passes
    to a handler; both arguments are ignored.
    NOTE(review): the registration call is not visible in this excerpt.
    """
    # Raising SystemExit directly is what sys.exit(1) does under the hood.
    raise SystemExit(1)


def error_exit(script_name, message):
    """Report a fatal error on stderr and exit with status 1.

    The emitted line has the form ``<script_name>: <message>``.
    This function does not return; it always raises SystemExit.
    """
    report = f'{script_name}: {message}'
    print(report, file=sys.stderr)
    raise SystemExit(1)


class DocGenerator:

def __init__(self,
Expand Down Expand Up @@ -150,7 +153,8 @@ def error(self, message):
self.usage_msg()


def generate_docs(workload: str,
def generate_docs(script_name: str,
workload: str,
repository: str,
input_file: str,
output_file_suffix: str,
Expand All @@ -165,12 +169,17 @@ def generate_docs(workload: str,
#
config = configparser.ConfigParser()
benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
config.read(benchmark_home + '/.benchmark/benchmark.ini')
benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
if not os.path.isfile(benchmark_ini):
error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it")
config.read(benchmark_ini)

root_dir = config['node']['root.dir']
workload_dir= root_dir + '/workloads/' + repository + '/' + workload
data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload

if not os.path.exists(data_dir):
error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it")
output_file = data_dir + '/documents-' + output_file_suffix + '.json'
if '/' not in input_file:
input_file = data_dir + '/' + input_file
Expand Down Expand Up @@ -274,8 +283,6 @@ def main(args: list) -> None:
output_file_suffix = args.output_file_suffix
n_docs = args.number_of_docs
corpus_size = args.corpus_size
interval = args.interval if args.interval is not None else \
corpus_size * -2
start_timestamp = args.start_timestamp
batch_size = args.batch_size

Expand All @@ -286,12 +293,14 @@ def main(args: list) -> None:
elif not n_docs and not corpus_size:
parser.usage_msg(script_name +
": must specify number of documents or corpus size")

interval = args.interval if args.interval is not None else \
corpus_size * -2
if workload != 'http_logs':
parser.usage_msg(script_name +
': only the "http_logs" workload is currently supported')

generate_docs(workload,
generate_docs(script_name,
workload,
repository,
input_file,
output_file_suffix,
Expand Down

0 comments on commit 49dff84

Please sign in to comment.