diff --git a/osbenchmark/workload/loader.py b/osbenchmark/workload/loader.py index ce75d1236..aaff6fcb6 100644 --- a/osbenchmark/workload/loader.py +++ b/osbenchmark/workload/loader.py @@ -490,7 +490,8 @@ def download(self, base_url, source_url, target_path, size_in_bytes): self.logger.info("Downloading data from [%s] to [%s].", data_url, target_path) # we want to have a bit more accurate download progress as these files are typically very large - progress = net.Progress("[INFO] Downloading workload data", accuracy=1) + progress = net.Progress("[INFO] Downloading workload data: " + os.path.basename(target_path), + accuracy=1) net.download(data_url, target_path, size_in_bytes, progress_indicator=progress) progress.finish() self.logger.info("Downloaded data from [%s] to [%s].", data_url, target_path) @@ -504,7 +505,7 @@ def download(self, base_url, source_url, target_path, size_in_bytes): msg += f" (HTTP status: {e.code}, reason: {e.reason})" else: msg += f" (HTTP status: {e.code})" - raise exceptions.DataError(msg) from e + raise exceptions.DataError(msg, e) from None except urllib.error.URLError as e: raise exceptions.DataError(f"Could not download [{data_url}] to [{target_path}].") from e @@ -523,6 +524,7 @@ def __init__(self, workload_name, downloader, decompressor): self.workload_name = workload_name self.downloader = downloader self.decompressor = decompressor + self.logger = logging.getLogger(__name__) def is_locally_available(self, file_name): return os.path.isfile(file_name) @@ -586,6 +588,12 @@ def prepare_document_set(self, document_set, data_root): else: raise if document_set.support_file_offset_table: + if not document_set.source_url: + try: + self.downloader.download(document_set.base_url, None, doc_path + '.offset', None) + except exceptions.DataError as e: + if type(e.cause) == urllib.error.HTTPError and (e.cause.code == 403 or e.cause.code == 404): + self.logger.info("Pre-generated offset file not found, will generate from corpus data") self.create_file_offset_table(doc_path, document_set.number_of_lines) def prepare_bundled_document_set(self, document_set, data_root): diff --git a/tests/workload/loader_test.py b/tests/workload/loader_test.py index fcf40d5f1..788fc87ec 100644 --- a/tests/workload/loader_test.py +++ b/tests/workload/loader_test.py @@ -261,12 +261,12 @@ def test_download_document_archive_if_no_file_available(self, is_file, get_size, # after download uncompressed file still does not exist (in main loop) # after download compressed file exists (in main loop) # after decompression, uncompressed file exists - is_file.side_effect = [False, False, True, False, True, True, True] + is_file.side_effect = [False, False, True, False, True, True, True, True] # compressed file size is 200 after download # compressed file size is 200 after download (in main loop) # uncompressed file size is 2000 after decompression # uncompressed file size is 2000 after decompression (in main loop) - get_size.side_effect = [200, 200, 2000, 2000] + get_size.side_effect = [200, 200, 2000, 2000, None] prepare_file_offset_table.return_value = 5 @@ -285,8 +285,11 @@ def test_download_document_archive_if_no_file_available(self, is_file, get_size, ensure_dir.assert_called_with("/tmp") decompress.assert_called_with("/tmp/docs.json.bz2", "/tmp") - download.assert_called_with("http://benchmarks.opensearch.org/corpora/unit-test/docs.json.bz2", - "/tmp/docs.json.bz2", 200, progress_indicator=mock.ANY) + calls = [ mock.call("http://benchmarks.opensearch.org/corpora/unit-test/docs.json.bz2", + "/tmp/docs.json.bz2", 200, progress_indicator=mock.ANY), + mock.call("http://benchmarks.opensearch.org/corpora/unit-test/docs.json.offset", + "/tmp/docs.json.offset", None, progress_indicator=mock.ANY) ] + download.assert_has_calls(calls) prepare_file_offset_table.assert_called_with("/tmp/docs.json") @mock.patch("osbenchmark.utils.io.prepare_file_offset_table") @@ -303,12 +306,12 @@ def test_download_document_archive_with_source_url_compressed(self, is_file, get # after download uncompressed file still does not exist (in main loop) # after download compressed file exists (in main loop) # after decompression, uncompressed file exists - is_file.side_effect = [False, False, True, False, True, True, True] + is_file.side_effect = [False, False, True, False, True, True, True, True] # compressed file size is 200 after download # compressed file size is 200 after download (in main loop) # uncompressed file size is 2000 after decompression # uncompressed file size is 2000 after decompression (in main loop) - get_size.side_effect = [200, 200, 2000, 2000] + get_size.side_effect = [200, 200, 2000, 2000, None] prepare_file_offset_table.return_value = 5 @@ -381,7 +384,7 @@ def test_download_document_with_trailing_baseurl_slash(self, is_file, get_size, # uncompressed file does not exist # after download uncompressed file exists # after download uncompressed file exists (main loop) - is_file.side_effect = [False, True, True] + is_file.side_effect = [False, True, True, True] # uncompressed file size is 2000 get_size.return_value = 2000 scheme = random.choice(["http", "https", "s3", "gs"]) @@ -403,8 +406,9 @@ def test_download_document_with_trailing_baseurl_slash(self, is_file, get_size, data_root="/tmp") ensure_dir.assert_called_with("/tmp") - download.assert_called_with(f"{scheme}://benchmarks.opensearch.org/corpora/unit-test/docs.json", - "/tmp/docs.json", 2000, progress_indicator=mock.ANY) + calls = [ mock.call(f"{scheme}://benchmarks.opensearch.org/corpora/unit-test/docs.json", "/tmp/docs.json", 2000, progress_indicator=mock.ANY), + mock.call(f"{scheme}://benchmarks.opensearch.org/corpora/unit-test/docs.json.offset", "/tmp/docs.json.offset", None, progress_indicator=mock.ANY) ] + download.assert_has_calls(calls) prepare_file_offset_table.assert_called_with("/tmp/docs.json") @mock.patch("osbenchmark.utils.io.prepare_file_offset_table") @@ -416,7 +420,7 @@ def test_download_document_file_if_no_file_available(self, is_file, get_size, en # uncompressed file does not exist # after download uncompressed file exists # after download uncompressed file exists (main loop) - is_file.side_effect = [False, True, True] + is_file.side_effect = [False, True, True, True] # uncompressed file size is 2000 get_size.return_value = 2000 @@ -437,8 +441,9 @@ def test_download_document_file_if_no_file_available(self, is_file, get_size, en data_root="/tmp") ensure_dir.assert_called_with("/tmp") - download.assert_called_with("http://benchmarks.opensearch.org/corpora/unit-test/docs.json", - "/tmp/docs.json", 2000, progress_indicator=mock.ANY) + calls = [ mock.call("http://benchmarks.opensearch.org/corpora/unit-test/docs.json", "/tmp/docs.json", 2000, progress_indicator=mock.ANY), + mock.call("http://benchmarks.opensearch.org/corpora/unit-test/docs.json.offset", "/tmp/docs.json.offset", None, progress_indicator=mock.ANY) ] + download.assert_has_calls(calls) prepare_file_offset_table.assert_called_with("/tmp/docs.json") @mock.patch("osbenchmark.utils.net.download")