From 05b7abd2a18b03e00b77a7954d82590651d811cd Mon Sep 17 00:00:00 2001 From: Govind Kamat Date: Wed, 9 Oct 2024 09:19:55 -0700 Subject: [PATCH] Minor updates for multi-part downloads (#678) Signed-off-by: Govind Kamat --- osbenchmark/workload/loader.py | 17 +++++++++++------ tests/workload/loader_test.py | 3 ++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/osbenchmark/workload/loader.py b/osbenchmark/workload/loader.py index 96298010..a97f85fe 100644 --- a/osbenchmark/workload/loader.py +++ b/osbenchmark/workload/loader.py @@ -582,12 +582,17 @@ def prepare_document_set(self, document_set, data_root): if document_set.document_file_parts: for part in document_set.document_file_parts: self.downloader.download(document_set.base_url, None, os.path.join(data_root, part["name"]), part["size"]) - with open(target_path, "wb") as outfile: - console.info(f"Concatenating file parts {', '.join([p['name'] for p in document_set.document_file_parts])}" - "into {os.path.basename(target_path)}", flush=True, logger=self.logger) - for part in document_set.document_file_parts: - with open(os.path.join(data_root, part["name"]), "rb") as infile: - shutil.copyfileobj(infile, outfile) + try: + with open(target_path, "wb") as outfile: + console.info(f"Concatenating file parts {', '.join([p['name'] for p in document_set.document_file_parts])}" + f" into {os.path.basename(target_path)}", flush=True, logger=self.logger) + for part in document_set.document_file_parts: + part_name = os.path.join(data_root, part["name"]) + with open(part_name, "rb") as infile: + shutil.copyfileobj(infile, outfile) + os.remove(part_name) + except Exception as e: + raise exceptions.DataError(f"Encountered exception {repr(e)} when building corpus file from parts") else: self.downloader.download(document_set.base_url, document_set.source_url, target_path, expected_size) except exceptions.DataError as e: diff --git a/tests/workload/loader_test.py b/tests/workload/loader_test.py index bff592bb..562afea0 100644 --- a/tests/workload/loader_test.py +++ b/tests/workload/loader_test.py @@ -869,7 +869,8 @@ class WorkloadPreparationTests_1(TestCase): @mock.patch("osbenchmark.utils.io.ensure_dir") @mock.patch("os.path.getsize") @mock.patch("os.path.isfile") - def test_download_document_file_from_part_files(self, is_file, get_size, ensure_dir, download, prepare_file_offset_table): + @mock.patch("os.remove") + def test_download_document_file_from_part_files(self, rm_file, is_file, get_size, ensure_dir, download, prepare_file_offset_table): # uncompressed file does not exist # after download uncompressed file exists # after download uncompressed file exists (main loop)