Skip to content

Commit

Permalink
Make archive extraction always take members into account (#316)
Browse files Browse the repository at this point in the history
The logic whether extraction needs to be run depended on the target directory
existing on the file system. This made it impossible to extract multiple times
from the same archive with differing members. Change the behaviour to checking
whether all required files already present. In the process, also normalize the
members parameter to not have leading `./`.
  • Loading branch information
dokempf authored Aug 8, 2022
1 parent 65d74c9 commit 68ae6de
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 9 deletions.
41 changes: 32 additions & 9 deletions pooch/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,36 @@ def __call__(self, fname, action, pooch):
else:
archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
self.extract_dir = os.path.join(archive_dir, self.extract_dir)
if action in ("update", "download") or not os.path.exists(self.extract_dir):
if (
(action in ("update", "download"))
or (not os.path.exists(self.extract_dir))
or (
(self.members is not None)
and (
not all(
os.path.exists(os.path.join(self.extract_dir, m))
for m in self.members
)
)
)
):
# Make sure that the folder with the extracted files exists
os.makedirs(self.extract_dir, exist_ok=True)
self._extract_file(fname, self.extract_dir)

# Get a list of all file names (including subdirectories) in our folder
# of unzipped files.
fnames = [
os.path.join(path, fname)
for path, _, files in os.walk(self.extract_dir)
for fname in files
]
# of unzipped files, filtered by the given members list
fnames = []
for path, _, files in os.walk(self.extract_dir):
for filename in files:
relpath = os.path.normpath(
os.path.join(os.path.relpath(path, self.extract_dir), filename)
)
if self.members is None or any(
relpath.startswith(os.path.normpath(m)) for m in self.members
):
fnames.append(os.path.join(path, filename))

return fnames

def _extract_file(self, fname, extract_dir):
Expand Down Expand Up @@ -153,7 +172,9 @@ def _extract_file(self, fname, extract_dir):
# Based on:
# https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
subdir_members = [
name for name in zip_file.namelist() if name.startswith(member)
name
for name in zip_file.namelist()
if os.path.normpath(name).startswith(os.path.normpath(member))
]
# Extract the data file from within the archive
zip_file.extractall(members=subdir_members, path=extract_dir)
Expand Down Expand Up @@ -216,7 +237,9 @@ def _extract_file(self, fname, extract_dir):
subdir_members = [
info
for info in tar_file.getmembers()
if info.name.startswith(member)
if os.path.normpath(info.name).startswith(
os.path.normpath(member)
)
]
# Extract the data file from within the archive
tar_file.extractall(members=subdir_members, path=extract_dir)
Expand Down
51 changes: 51 additions & 0 deletions pooch/tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,57 @@ def test_unpacking(processor_class, extension, target_path, archive, members):
check_tiny_data(fname)


@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_multiple_unpacking(processor_class, extension):
"Test that multiple subsequent calls to a processor yield correct results"

with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)

# Do a first fetch with the one member only
processor1 = processor_class(members=["store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1
check_tiny_data(filenames1[0])

# Do a second fetch with the other member
processor2 = processor_class(
members=["store/tiny-data.txt", "store/subdir/tiny-data.txt"]
)
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) == 2
check_tiny_data(filenames2[0])
check_tiny_data(filenames2[1])

# Do a third fetch, again with one member and assert
# that only this member was returned
filenames3 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames3) == 1
check_tiny_data(filenames3[0])


@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpack_members_with_leading_dot(processor_class, extension):
"Test that unpack members can also be specifed both with a leading ./"

with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)

# Do a first fetch with the one member only
processor1 = processor_class(members=["./store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1
check_tiny_data(filenames1[0])


def _check_logs(log_file, expected_lines):
"""
Assert that the lines in the log match the expected ones.
Expand Down

0 comments on commit 68ae6de

Please sign in to comment.