Update directory hasher
CBroz1 committed Nov 13, 2024
1 parent 23799f8 commit d0011bf
Showing 1 changed file with 84 additions and 34 deletions.
118 changes: 84 additions & 34 deletions src/spyglass/utils/nwb_hash.py
@@ -1,4 +1,5 @@
import atexit
import json
from hashlib import md5
from pathlib import Path
from typing import Any, Union
@@ -8,37 +9,91 @@
from tqdm import tqdm

DEFAULT_BATCH_SIZE = 4095
IGNORED_KEYS = ["version"]


def hash_directory(directory_path: str, batch_size: int = DEFAULT_BATCH_SIZE):
    """Generate a hash of the contents of a directory, recursively.

    Searches through all files in the directory and subdirectories, hashing
    the contents of files. NWB files are hashed with the NwbfileHasher class.

    Parameters
    ----------
    directory_path : str
        Path to the directory to hash.
    batch_size : int, optional
        Limit of data to hash for large files, by default 4095.
    """
    hash_obj = md5()

    for file_path in sorted(Path(directory_path).rglob("*")):
        if not file_path.is_file():  # Only hash files, not directories
            continue
        if file_path.suffix == ".nwb":
            hasher = NwbfileHasher(file_path, batch_size=batch_size)
            hash_obj.update(hasher.hash.encode())
            continue
class DirectoryHasher:
    def __init__(
        self,
        directory_path: Union[str, Path],
        batch_size: int = DEFAULT_BATCH_SIZE,
        verbose: bool = False,
    ):
        """Generate a hash of the contents of a directory, recursively.

        Searches through all files in the directory and subdirectories,
        hashing the contents of each. NWB files are hashed with the
        NwbfileHasher class. JSON files are hashed by encoding their
        contents while ignoring specific keys, like 'version'. All other
        files are hashed by reading the file in chunks.

        If the contents of a JSON file are otherwise the same but the
        'version' value differs, we assume that the dependency change had
        no effect on the data and ignore the difference.

        Parameters
        ----------
        directory_path : Union[str, Path]
            Path to the directory to hash.
        batch_size : int, optional
            Limit of data to hash for large files, by default 4095.
        verbose : bool, optional
            Show a progress bar and per-file hashes, by default False.
        """

        self.dir_path = Path(directory_path)
        self.batch_size = batch_size
        self.verbose = verbose
        self.hashed = md5("".encode())
        self.hash = self.compute_hash()

    def compute_hash(self) -> str:
        """Hashes the contents of the directory, recursively."""
        all_files = [f for f in sorted(self.dir_path.rglob("*")) if f.is_file()]

        for file_path in tqdm(all_files, disable=not self.verbose):
            if file_path.suffix == ".nwb":
                hasher = NwbfileHasher(file_path, batch_size=self.batch_size)
                self.hashed.update(hasher.hash.encode())
            elif file_path.suffix == ".json":
                self.hashed.update(self.json_encode(file_path))
            else:
                self.chunk_encode(file_path)

            # update with the relative path to distinguish identically named
            # files that live in different subdirectories
            rel_path = str(file_path.relative_to(self.dir_path))
            self.hashed.update(rel_path.encode())

            if self.verbose:
                print(f"{file_path.name}: {self.hashed.hexdigest()}")

        return self.hashed.hexdigest()  # Return the hex digest of the hash

    def chunk_encode(self, file_path: Path) -> None:
        """Encode the contents of a file in chunks for hashing."""
        with file_path.open("rb") as f:
            while chunk := f.read(self.batch_size):
                self.hashed.update(chunk)
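
Feeding md5 a file in fixed-size chunks yields the same digest as hashing the full contents at once, which is why chunk_encode can cap memory use without changing the result. A minimal standalone sketch of that equivalence (the temporary file and sizes are illustrative, not part of the commit):

import hashlib
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    blob = Path(tmp) / "blob.bin"
    blob.write_bytes(b"x" * 10_000)  # arbitrary contents larger than one batch

    one_shot = hashlib.md5(blob.read_bytes()).hexdigest()

    chunked = hashlib.md5()
    with blob.open("rb") as f:
        while chunk := f.read(4095):  # same value as DEFAULT_BATCH_SIZE
            chunked.update(chunk)

    assert one_shot == chunked.hexdigest()  # incremental update == one-shot hash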

    def json_encode(self, file_path: Path) -> bytes:
        """Encode the contents of a json file for hashing.

        Ignores the 'version' key(s) in the json file.
        """
        with file_path.open("r") as f:
            file_data = json.load(f, object_hook=self.pop_version)
            return json.dumps(file_data, sort_keys=True).encode()

    def pop_version(self, data: Union[dict, list]) -> Union[dict, list]:
        """Recursively remove banned keys from any nested dicts/lists."""
        if isinstance(data, dict):
            return {
                k: self.pop_version(v)
                for k, v in data.items()
                if k not in IGNORED_KEYS
            }
        elif isinstance(data, list):
            return [self.pop_version(item) for item in data]
        else:
            return data
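
A minimal usage sketch of the class above, assuming DirectoryHasher is importable from spyglass.utils.nwb_hash as this diff suggests. Two directories that differ only in a JSON 'version' value should produce the same hash, since pop_version strips that key before encoding (file names and values here are hypothetical):

import json
from pathlib import Path
from tempfile import TemporaryDirectory

from spyglass.utils.nwb_hash import DirectoryHasher  # module path assumed from this diff

# Two directories whose only difference is the 'version' value in a JSON file.
with TemporaryDirectory() as dir_a, TemporaryDirectory() as dir_b:
    for tmp, version in ((dir_a, "0.1.0"), (dir_b, "0.2.0")):
        (Path(tmp) / "params.json").write_text(
            json.dumps({"version": version, "threshold": 3.5})
        )
        (Path(tmp) / "notes.txt").write_text("identical in both directories")

    # 'version' is in IGNORED_KEYS, so both directories hash the same.
    assert DirectoryHasher(dir_a).hash == DirectoryHasher(dir_b).hash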


class NwbfileHasher:
@@ -50,11 +105,6 @@ def __init__(
    ):
        """Hashes the contents of an NWB file, limiting to partial data.

        In testing, chunking the data for large datasets caused false positives
        in the hash comparison, and some datasets may be too large to store in
        memory. This method limits the data to the first N elements to avoid
        this issue, and may not be suitable for all datasets.

        Parameters
        ----------
        path : Union[str, Path]
@@ -105,7 +155,7 @@ def serialize_attr_value(self, value: Any):
            Serialized bytes of the attribute value.
        """
        if isinstance(value, np.ndarray):
            return value.astype(str).tobytes()  # must be 'astype(str)'
        elif isinstance(value, (str, int, float)):
            return str(value).encode()
        return repr(value).encode()  # For other data types, use repr
@@ -140,6 +190,7 @@ def compute_hash(self) -> str:
            disable=not self.verbose,
        ):
            self.hashed.update(name.encode())

            for attr_key in sorted(obj.attrs):
                attr_value = obj.attrs[attr_key]
                _ = self.hash_shape_dtype(attr_value)
@@ -149,7 +200,6 @@ def compute_hash(self) -> str:
            if isinstance(obj, h5py.Dataset):
                _ = self.hash_dataset(obj)
            elif isinstance(obj, h5py.SoftLink):
                # TODO: Check that this works
                self.hashed.update(obj.path.encode())
            elif isinstance(obj, h5py.Group):
                for k, v in obj.items():

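Only part of NwbfileHasher is visible in this diff, but the calls shown above suggest usage along these lines. The file paths are hypothetical, and constructor arguments beyond the path and batch_size are not confirmed by this hunk:

from spyglass.utils.nwb_hash import DirectoryHasher, NwbfileHasher

# Hash a single NWB file (hypothetical path); the hex digest is stored on .hash.
file_hash = NwbfileHasher("/data/raw/my_session.nwb", batch_size=4095).hash
print("nwb:", file_hash)

# Hash a whole export directory; .nwb, .json, and other files each take the
# branch shown in DirectoryHasher.compute_hash above.
dir_hash = DirectoryHasher("/data/export/my_export", verbose=True).hash
print("dir:", dir_hash)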