Skip to content

Commit

Permalink
Minor updates for multi-part downloads (#678)
Browse files: browse the repository at this point in the history
Signed-off-by: Govind Kamat <[email protected]>
  • Loading branch information
gkamat authored Oct 9, 2024
1 parent 4609bfd commit 2596bb8
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
17 changes: 11 additions & 6 deletions osbenchmark/workload/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,12 +582,17 @@ def prepare_document_set(self, document_set, data_root):
if document_set.document_file_parts:
for part in document_set.document_file_parts:
self.downloader.download(document_set.base_url, None, os.path.join(data_root, part["name"]), part["size"])
with open(target_path, "wb") as outfile:
console.info(f"Concatenating file parts {', '.join([p['name'] for p in document_set.document_file_parts])}"
"into {os.path.basename(target_path)}", flush=True, logger=self.logger)
for part in document_set.document_file_parts:
with open(os.path.join(data_root, part["name"]), "rb") as infile:
shutil.copyfileobj(infile, outfile)
try:
with open(target_path, "wb") as outfile:
console.info(f"Concatenating file parts {', '.join([p['name'] for p in document_set.document_file_parts])}"
f" into {os.path.basename(target_path)}", flush=True, logger=self.logger)
for part in document_set.document_file_parts:
part_name = os.path.join(data_root, part["name"])
with open(part_name, "rb") as infile:
shutil.copyfileobj(infile, outfile)
os.remove(part_name)
except Exception as e:
raise exceptions.DataError(f"Encountered exception {repr(e)} when building corpus file from parts")
else:
self.downloader.download(document_set.base_url, document_set.source_url, target_path, expected_size)
except exceptions.DataError as e:
Expand Down
3 changes: 2 additions & 1 deletion tests/workload/loader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,8 @@ class WorkloadPreparationTests_1(TestCase):
@mock.patch("osbenchmark.utils.io.ensure_dir")
@mock.patch("os.path.getsize")
@mock.patch("os.path.isfile")
def test_download_document_file_from_part_files(self, is_file, get_size, ensure_dir, download, prepare_file_offset_table):
@mock.patch("os.remove")
def test_download_document_file_from_part_files(self, rm_file, is_file, get_size, ensure_dir, download, prepare_file_offset_table):
# uncompressed file does not exist
# after download uncompressed file exists
# after download uncompressed file exists (main loop)
Expand Down

0 comments on commit 2596bb8

Please sign in to comment.