Update directory hasher
CBroz1 committed Nov 13, 2024
1 parent 23799f8 commit d0011bf
Showing 1 changed file with 84 additions and 34 deletions.
118 changes: 84 additions & 34 deletions src/spyglass/utils/nwb_hash.py
@@ -1,4 +1,5 @@
import atexit
import json
from hashlib import md5
from pathlib import Path
from typing import Any, Union
@@ -8,37 +9,91 @@
from tqdm import tqdm

DEFAULT_BATCH_SIZE = 4095
IGNORED_KEYS = ["version"]


def hash_directory(directory_path: str, batch_size: int = DEFAULT_BATCH_SIZE):
    """Generate a hash of the contents of a directory, recursively.

    Searches through all files in the directory and subdirectories, hashing
    the contents of files. NWB files are hashed with the NwbfileHasher class.

    Parameters
    ----------
    directory_path : str
        Path to the directory to hash.
    batch_size : int, optional
        Limit of data to hash for large files, by default 4095.
    """
    hash_obj = md5()

    for file_path in sorted(Path(directory_path).rglob("*")):
        if not file_path.is_file():  # Only hash files, not directories
            continue
        if file_path.suffix == ".nwb":
            hasher = NwbfileHasher(file_path, batch_size=batch_size)
            hash_obj.update(hasher.hash.encode())
            continue
class DirectoryHasher:
    def __init__(
        self,
        directory_path: Union[str, Path],
        batch_size: int = DEFAULT_BATCH_SIZE,
        verbose: bool = False,
    ):
        """Generate a hash of the contents of a directory, recursively.

        Searches through all files in the directory and subdirectories,
        hashing the contents of each. NWB files are hashed with the
        NwbfileHasher class. JSON files are hashed by encoding their
        contents while ignoring specific keys, like 'version'. All other
        files are hashed by reading the file in chunks.

        If the contents of a JSON file are otherwise the same but the
        'version' value differs, we assume that the dependency change had
        no effect on the data and ignore the difference.

        Parameters
        ----------
        directory_path : Union[str, Path]
            Path to the directory to hash.
        batch_size : int, optional
            Limit of data to hash for large files, by default 4095.
        verbose : bool, optional
            Show a progress bar and per-file hashes, by default False.
        """

        self.dir_path = Path(directory_path)
        self.batch_size = batch_size
        self.verbose = verbose
        self.hashed = md5("".encode())
        self.hash = self.compute_hash()

    def compute_hash(self) -> str:
        """Hashes the contents of the directory, recursively."""
        all_files = [f for f in sorted(self.dir_path.rglob("*")) if f.is_file()]

        for file_path in tqdm(all_files, disable=not self.verbose):
            if file_path.suffix == ".nwb":
                hasher = NwbfileHasher(file_path, batch_size=self.batch_size)
                self.hashed.update(hasher.hash.encode())
            elif file_path.suffix == ".json":
                self.hashed.update(self.json_encode(file_path))
            else:
                self.chunk_encode(file_path)

            # update with the relative path to distinguish identically named
            # files that live in different subdirectories
            rel_path = str(file_path.relative_to(self.dir_path))
            self.hashed.update(rel_path.encode())

            if self.verbose:
                print(f"{file_path.name}: {self.hashed.hexdigest()}")

        return self.hashed.hexdigest()  # Return the hex digest of the hash

    def chunk_encode(self, file_path: Path) -> None:
        """Encode the contents of a file in chunks for hashing."""
        with file_path.open("rb") as f:
            while chunk := f.read(self.batch_size):
                self.hashed.update(chunk)
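
Feeding md5 a file in fixed-size chunks yields the same digest as hashing the full contents at once, which is why chunk_encode can cap memory use without changing the result. A minimal standalone sketch of that equivalence (the temporary file and sizes are illustrative, not part of the commit):

import hashlib
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    blob = Path(tmp) / "blob.bin"
    blob.write_bytes(b"x" * 10_000)  # arbitrary contents larger than one batch

    one_shot = hashlib.md5(blob.read_bytes()).hexdigest()

    chunked = hashlib.md5()
    with blob.open("rb") as f:
        while chunk := f.read(4095):  # same value as DEFAULT_BATCH_SIZE
            chunked.update(chunk)

    assert one_shot == chunked.hexdigest()  # incremental update == one-shot hash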

    def json_encode(self, file_path: Path) -> bytes:
        """Encode the contents of a json file for hashing.

        Ignores the 'version' key(s) in the json file.
        """
        with file_path.open("r") as f:
            file_data = json.load(f, object_hook=self.pop_version)
            return json.dumps(file_data, sort_keys=True).encode()

    def pop_version(self, data: Union[dict, list]) -> Union[dict, list]:
        """Recursively remove banned keys from any nested dicts/lists."""
        if isinstance(data, dict):
            return {
                k: self.pop_version(v)
                for k, v in data.items()
                if k not in IGNORED_KEYS
            }
        elif isinstance(data, list):
            return [self.pop_version(item) for item in data]
        else:
            return data
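
A minimal usage sketch of the class above, assuming DirectoryHasher is importable from spyglass.utils.nwb_hash as this diff suggests. Two directories that differ only in a JSON 'version' value should produce the same hash, since pop_version strips that key before encoding (file names and values here are hypothetical):

import json
from pathlib import Path
from tempfile import TemporaryDirectory

from spyglass.utils.nwb_hash import DirectoryHasher  # module path assumed from this diff

# Two directories whose only difference is the 'version' value in a JSON file.
with TemporaryDirectory() as dir_a, TemporaryDirectory() as dir_b:
    for tmp, version in ((dir_a, "0.1.0"), (dir_b, "0.2.0")):
        (Path(tmp) / "params.json").write_text(
            json.dumps({"version": version, "threshold": 3.5})
        )
        (Path(tmp) / "notes.txt").write_text("identical in both directories")

    # 'version' is in IGNORED_KEYS, so both directories hash the same.
    assert DirectoryHasher(dir_a).hash == DirectoryHasher(dir_b).hash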


class NwbfileHasher:
@@ -50,11 +105,6 @@ def __init__(
    ):
        """Hashes the contents of an NWB file, limiting to partial data.

        In testing, chunking the data for large datasets caused false positives
        in the hash comparison, and some datasets may be too large to store in
        memory. This method limits the data to the first N elements to avoid
        this issue, and may not be suitable for all datasets.

        Parameters
        ----------
        path : Union[str, Path]
@@ -105,7 +155,7 @@ def serialize_attr_value(self, value: Any):
            Serialized bytes of the attribute value.
        """
        if isinstance(value, np.ndarray):
            return value.astype(str).tobytes()  # must be 'astype(str)'
        elif isinstance(value, (str, int, float)):
            return str(value).encode()
        return repr(value).encode()  # For other data types, use repr
@@ -140,6 +190,7 @@ def compute_hash(self) -> str:
            disable=not self.verbose,
        ):
            self.hashed.update(name.encode())

            for attr_key in sorted(obj.attrs):
                attr_value = obj.attrs[attr_key]
                _ = self.hash_shape_dtype(attr_value)
@@ -149,7 +200,6 @@ def compute_hash(self) -> str:
            if isinstance(obj, h5py.Dataset):
                _ = self.hash_dataset(obj)
            elif isinstance(obj, h5py.SoftLink):
                # TODO: Check that this works
                self.hashed.update(obj.path.encode())
            elif isinstance(obj, h5py.Group):
                for k, v in obj.items():

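Only part of NwbfileHasher is visible in this diff, but the calls shown above suggest usage along these lines. The file paths are hypothetical, and constructor arguments beyond the path and batch_size are not confirmed by this hunk:

from spyglass.utils.nwb_hash import DirectoryHasher, NwbfileHasher

# Hash a single NWB file (hypothetical path); the hex digest is stored on .hash.
file_hash = NwbfileHasher("/data/raw/my_session.nwb", batch_size=4095).hash
print("nwb:", file_hash)

# Hash a whole export directory; .nwb, .json, and other files each take the
# branch shown in DirectoryHasher.compute_hash above.
dir_hash = DirectoryHasher("/data/export/my_export", verbose=True).hash
print("dir:", dir_hash)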