diff --git a/dvc/fs/base.py b/dvc/fs/base.py index 604bed5447..7aa349b303 100644 --- a/dvc/fs/base.py +++ b/dvc/fs/base.py @@ -175,9 +175,12 @@ def walk_files(self, path_info, **kwargs): """ raise NotImplementedError - def ls(self, path_info, detail=False, **kwargs): + def ls(self, path_info, detail=False): raise RemoteActionNotImplemented("ls", self.scheme) + def find(self, path_info, detail=False): + raise RemoteActionNotImplemented("find", self.scheme) + def is_empty(self, path_info): return False diff --git a/dvc/fs/fsspec_wrapper.py b/dvc/fs/fsspec_wrapper.py index 0470b5da62..6d4ca84777 100644 --- a/dvc/fs/fsspec_wrapper.py +++ b/dvc/fs/fsspec_wrapper.py @@ -33,15 +33,13 @@ def _strip_bucket(self, entry): return path or bucket - def _strip_buckets(self, entries, detail, prefix=None): + def _strip_buckets(self, entries, detail=False): for entry in entries: if detail: entry = self._entry_hook(entry.copy()) entry["name"] = self._strip_bucket(entry["name"]) else: - entry = self._strip_bucket( - f"{prefix}/{entry}" if prefix else entry - ) + entry = self._strip_bucket(entry) yield entry def _entry_hook(self, entry): @@ -93,24 +91,29 @@ def copy(self, from_info, to_info): def exists(self, path_info, use_dvcignore=False): return self.fs.exists(self._with_bucket(path_info)) - def ls( - self, path_info, detail=False, recursive=False - ): # pylint: disable=arguments-differ - if self.isdir(path_info) and self.is_empty(path_info): - return None + def ls(self, path_info, detail=False): + path = self._with_bucket(path_info) + files = self.fs.ls(path, detail=detail) + yield from self._strip_buckets(files, detail=detail) + def find(self, path_info, detail=False): path = self._with_bucket(path_info) - if recursive: - for root, _, files in self.fs.walk(path, detail=detail): - if detail: - files = files.values() - yield from self._strip_buckets(files, detail, prefix=root) + files = self.fs.find(path, detail=detail) + if detail: + files = files.values() + + # When calling find() on a file, it returns the same file in a list. + # For object-based storages, the same behavior applies to empty + # directories since they are represented as files. This condition + # checks whether we should yield an empty list (if it is an empty + # directory) or just yield the file itself. + if len(files) == 1 and files[0] == path and self.isdir(path_info): return None - yield from self._strip_buckets(self.ls(path, detail=detail), detail) + yield from self._strip_buckets(files, detail=detail) def walk_files(self, path_info, **kwargs): - for file in self.ls(path_info, recursive=True): + for file in self.find(path_info): yield path_info.replace(path=file) def remove(self, path_info): diff --git a/dvc/fs/gdrive.py b/dvc/fs/gdrive.py index c66b796456..f851cf0b3a 100644 --- a/dvc/fs/gdrive.py +++ b/dvc/fs/gdrive.py @@ -532,7 +532,7 @@ def _gdrive_list_ids(self, query_ids): query = f"({query}) and trashed=false" return self._gdrive_list(query) - def _ls_recursive(self, path_info, detail=False): + def find(self, path_info, detail=False): root_path = path_info.path seen_paths = set() @@ -569,13 +569,7 @@ def _ls_recursive(self, path_info, detail=False): else: yield item_path - def ls( - self, path_info, detail=False, recursive=False - ): # pylint: disable=arguments-differ - if recursive: - yield from self._ls_recursive(path_info, detail=detail) - return None - + def ls(self, path_info, detail=False): cached = path_info.path in self._ids_cache["dirs"] if cached: dir_ids = self._ids_cache["dirs"][path_info.path] @@ -605,8 +599,8 @@ def ls( self._cache_path_id(root_path, *dir_ids) def walk_files(self, path_info, **kwargs): - for filename in self.ls(path_info, recursive=True): - yield path_info.replace(path=filename) + for file in self.find(path_info): + yield path_info.replace(path=file) def remove(self, path_info): item_id = self._get_item_id(path_info) diff --git a/dvc/fs/s3.py b/dvc/fs/s3.py index f201ffeb58..81c8bf53f9 100644 --- a/dvc/fs/s3.py +++ b/dvc/fs/s3.py @@ -293,11 +293,7 @@ def walk_files(self, path_info, **kwargs): yield path_info.replace(path=fname) - def ls( - self, path_info, detail=False, recursive=False - ): # pylint: disable=arguments-differ - assert recursive - + def find(self, path_info, detail=False): with self._get_bucket(path_info.bucket) as bucket: for obj_summary in bucket.objects.filter(Prefix=path_info.path): if detail: diff --git a/dvc/fs/webdav.py b/dvc/fs/webdav.py index b8536eccec..efe16dec7f 100644 --- a/dvc/fs/webdav.py +++ b/dvc/fs/webdav.py @@ -151,33 +151,27 @@ def isdir(self, path_info): # Use webdav is_dir to test whether path points to a directory return self._client.is_dir(path_info.path) - # Yields path info to all files def walk_files(self, path_info, **kwargs): - # Check whether directory exists - if not self.exists(path_info): - return - - # Collect directories - dirs = deque([path_info.path]) + for path in self.find(path_info): + yield path_info.replace(path=path) - # Iterate all directories found so far - while dirs: - # Iterate directory content - for entry in self._client.list(dirs.pop(), get_info=True): - # Construct path_info to entry - info = path_info.replace(path=entry["path"]) - - # Check whether entry is a directory + def ls(self, path_info, detail=False): + for entry in self._client.list(path_info.path): + path = entry["path"] + if detail: if entry["isdir"]: - # Append new found directory to directory list - dirs.append(info.path) + yield {"type": "directory", "name": path} else: - # Yield path info to non directory - yield info + yield { + "type": "file", + "name": path, + "size": entry["size"], + "etag": entry["etag"], + } + else: + yield path - def ls( - self, path_info, detail=False, recursive=False - ): # pylint: disable=arguments-differ + def find(self, path_info, detail=False): dirs = deque([path_info.path]) while dirs: @@ -197,14 +191,6 @@ def ls( else: yield path - if not recursive: - for entry in dirs: - if detail: - yield {"type": "directory", "name": entry} - else: - yield entry - return None - # Removes file/directory def remove(self, path_info): # Use webdav client clean (DELETE) method to remove file/directory diff --git a/dvc/objects/stage.py b/dvc/objects/stage.py index ef29f9a4e6..dd64bf9c62 100644 --- a/dvc/objects/stage.py +++ b/dvc/objects/stage.py @@ -101,7 +101,7 @@ def _build_objects(path_info, fs, name, odb, state, upload, **kwargs): def _iter_objects(path_info, fs, name, odb, state, upload, **kwargs): if not upload and name in fs.DETAIL_FIELDS: - for details in fs.ls(path_info, recursive=True, detail=True): + for details in fs.find(path_info, detail=True): file_info = path_info.replace(path=details["name"]) hash_info = HashInfo( name, details[name], size=details.get("size"), diff --git a/tests/func/test_fs.py b/tests/func/test_fs.py index 84f3658d81..b56268a5b6 100644 --- a/tests/func/test_fs.py +++ b/tests/func/test_fs.py @@ -319,14 +319,13 @@ def test_fs_ls(dvc, cloud): pytest.lazy_fixture("gdrive"), ], ) -def test_fs_ls_recursive(dvc, cloud): +def test_fs_find_recursive(dvc, cloud): cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}}) fs = get_cloud_fs(dvc, **cloud.config) path_info = fs.path_info assert { - os.path.basename(file_key) - for file_key in fs.ls(path_info / "data", recursive=True) + os.path.basename(file_key) for file_key in fs.find(path_info / "data") } == {"foo", "baz", "quux"} @@ -339,12 +338,12 @@ def test_fs_ls_recursive(dvc, cloud): pytest.lazy_fixture("webdav"), ], ) -def test_fs_ls_with_etag(dvc, cloud): +def test_fs_find_with_etag(dvc, cloud): cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}}) fs = get_cloud_fs(dvc, **cloud.config) path_info = fs.path_info - for details in fs.ls(path_info / "data", recursive=True, detail=True): + for details in fs.find(path_info / "data", detail=True): assert ( fs.info(path_info.replace(path=details["name"]))["etag"] == details["etag"]