Skip to content

Commit

Permalink
More performant walk implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
pjbull committed Oct 8, 2023
1 parent 07bcb16 commit d8fadc3
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 3 deletions.
42 changes: 40 additions & 2 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def _glob_checks(self, pattern: str) -> None:
".glob is only supported within a bucket or container; you can use `.iterdir` to list buckets; for example, CloudPath('s3://').iterdir()"
)

def _glob(self, selector, recursive: bool) -> Generator[Self, None, None]:
def _build_subtree(self, recursive):
# build a tree structure for all files out of default dicts
Tree: Callable = lambda: defaultdict(Tree)

Expand All @@ -443,7 +443,10 @@ def _build_tree(trunk, branch, nodes, is_dir):
nodes = (p for p in parts)
_build_tree(file_tree, next(nodes, None), nodes, is_dir)

file_tree = dict(file_tree) # freeze as normal dict before passing in
return dict(file_tree) # freeze as normal dict before passing in

def _glob(self, selector, recursive: bool) -> Generator[Self, None, None]:
file_tree = self._build_subtree(recursive)

root = _CloudPathSelectable(
self.name,
Expand Down Expand Up @@ -489,6 +492,41 @@ def iterdir(self) -> Generator[Self, None, None]:
if f != self: # iterdir does not include itself in pathlib
yield f

@staticmethod
def _walk_results_from_tree(root, tree, top_down=True):
""" Utility to yield tuples in the form expected by `.walk` from the file
tree constructed by `_build_substree`.
"""
dirs = []
files = []
for item, branch in tree.items():
files.append(item) if branch is None else dirs.append(item)

if top_down:
yield root, dirs, files

for dir in dirs:
yield from CloudPath._walk_results_from_tree(root / dir, tree[dir], top_down=top_down)

if not top_down:
yield root, dirs, files

def walk(
self,
top_down: bool = True,
on_error: Optional[Callable] = None,
follow_symlinks: bool = False,
) -> Generator[Tuple[Self, List[str], List[str]], None, None]:
try:
file_tree = self._build_subtree(recursive=True) # walking is always recursive
yield from self._walk_results_from_tree(self, file_tree, top_down=top_down)

except Exception as e:
if on_error is not None:
on_error(e)
else:
raise

def open(
self,
mode: str = "r",
Expand Down
9 changes: 9 additions & 0 deletions tests/performance/perf_file_listing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,12 @@ def glob(folder, recursive):
return {"n_items": len(list(folder.rglob("*.item")))}
else:
return {"n_items": len(list(folder.glob("*.item")))}


def walk(folder):
    """Count the file entries reported by `folder.walk()`.

    Returns a dict with the single key `"n_items"` holding the total number
    of file names across every tuple yielded by the walk.
    """
    total = sum(len(file_names) for _, _, file_names in folder.walk())
    return {"n_items": total}
11 changes: 10 additions & 1 deletion tests/performance/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from cloudpathlib import CloudPath


from perf_file_listing import folder_list, glob
from perf_file_listing import folder_list, glob, walk


# make loguru and tqdm play nicely together
Expand Down Expand Up @@ -137,6 +137,15 @@ def main(root, iterations, burn_in):
PerfRunConfig(name="Glob deep non-recursive", args=[deep, False], kwargs={}),
],
),
(
"Walk scenarios",
walk,
[
PerfRunConfig(name="Walk shallow", args=[shallow], kwargs={}),
PerfRunConfig(name="Walk normal", args=[normal], kwargs={}),
PerfRunConfig(name="Walk deep", args=[deep], kwargs={}),
],
),
]

logger.info(
Expand Down

0 comments on commit d8fadc3

Please sign in to comment.