invert, output

CZ-NIC · Mar 14, 2024 · e0620f9 · e0620f9
1 parent 27edb29
commit e0620f9
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -52,23 +52,26 @@ from deduplidog import Deduplidog
 Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True)
 ```
 
+This command produced the following output:
+
 ```
 Find files by size, ignoring: date, crc32
 Duplicates from the work dir at 'home' would be (if execute were True) renamed (prefixed with ✓).
 Number of originals: 38
 * /home/user/duplicates/foo.txt
   /media/disk/origs/foo.txt
   🔨home: renamable
-  📄media: DATE WARNING + a day
-Affectable: 38/38
-Affected size: 59.9 kB
+  📄media: DATE WARNING + a day 🛟skipped on warning
+Affectable: 37/38
+Affected size: 56.9 kB
 Warnings: 1
 ```
 
-We found out all the files in the *duplicates* folder seem to be useless but one. It's date is earlier than the original one. See with full log.
+We found out that all the files in the *duplicates* folder but one seem to be useless. Its date is earlier than the original one. The life buoy icon indicates that any action on that file would be prevented. To suppress this, let's turn on `set_both_to_older_date`. See the result with the full log.
 
 ```python3
-Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
+Deduplidog("/home/user/duplicates", "/media/disk/origs",
+   ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
 ```
 
 ```
@@ -94,7 +97,8 @@ Affected size: 59.9 kB
 As you can see, the log is in the briefest yet most transparent form. The files to be affected in the work folder are prefixed with the 🔨 icon, whereas those affected in the original folder use the 📄 icon. We might add the `execute=True` parameter to perform the actions, or use `bashify=True` to inspect them.
 
 ```python3
-Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
+Deduplidog("/home/user/duplicates", "/media/disk/origs",
+  ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
 ```
 
 The `bashify=True` just produces the commands we might use.
@@ -146,13 +150,15 @@ Find the duplicates. Normally, the file must have the same size, date and name.
 | strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS"  |
 | strip_suffix | str | False | When comparing files in work_dir, strip the file name end matched by a regular expression. Ex: "001-edited.jpg" is compared as "001.jpg"  |
 | work_file_stem_shortened | int | None | Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened. |
+| invert_selection | bool | False | Match only those files from work_dir that do not match the criteria. |
 | **Media** |
 | media_magic | bool | False | Neither the size nor the date is compared for files with media suffixes.<br>A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.<br>An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.<br>(This mode is considerably slower.) |
 | accepted_frame_delta | int | 1 | Used only when media_magic is True |
 | accepted_img_hash_diff | int | 1 | Used only when media_magic is True |
 | img_compare_date | bool | False | If True and `media_magic=True`, the work file date or the work file EXIF date must match the original file date (has to be no more than an hour around). |
 | **Helper** |
 | log_level | int | 30 (warning) | 10 debug .. 50 critical |
+| output | bool | False | Stores the output log to a file in the current working directory. (Never overwrites an older file.) |
 
 ## Utils
 In the `deduplidog.utils` package, you'll find several handy tools to help you. You can discover their parameters through your IDE hints.

diff --git a/deduplidog/deduplidog.py b/deduplidog/deduplidog.py
@@ -1,3 +1,4 @@
+from contextlib import redirect_stdout
 import logging
 import os
 import re
@@ -19,7 +20,7 @@
 from tqdm.autonotebook import tqdm
 
 from .helpers import Field, FileMetadata, keydefaultdict
-from .utils import _qp, crc, get_frame_count
+from .utils import _qp, crc, get_frame_count, open_log_file
 
 VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv", ".hevc"
 IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif", ".avif", ".webp", ".heic", ".avif"
@@ -122,6 +123,8 @@ class Deduplidog:
         """When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" """, False)] = False
     work_file_stem_shortened: Annotated[int, opt(
         "Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened.", None)] = None
+    invert_selection: Annotated[bool, flag(
+        "Match only those files from work_dir that does not match the criterions.")] = False
 
     # Media section
     media_magic: Annotated[bool, flag(
@@ -139,8 +142,10 @@ class Deduplidog:
 
     # Helper section
     log_level: Annotated[int, opt("10 debug .. 50 critical", logging.WARNING, 1)] = logging.WARNING
+    output: Annotated[bool, flag(
+        "Stores the output log to a file in the current working directory. (Never overwrites an older file.)")] = False
 
-    # TODO output of log and of bashize should be outputtable to a file
+    # TODO bashize should be outputtable through output
 
     # Following parameters are undocumented:
 
@@ -193,6 +198,10 @@ def __post_init__(self):
         " TODO deprecated"
         self.original_dir_name = self.work_dir_name = None
         "Shortened name, human readable"
+        self.same_superdir = False
+        """ Work_dir and original dir is the same """
+        self._output = None
+        " Log buffer "
 
         self.check()
         self.perform()
@@ -221,11 +230,17 @@ def perform(self):
         self._common_prefix_length = len(os.path.commonprefix([self.original_dir, self.work_dir])) \
             if self.shorter_log else 0
 
+        if self.output:
+            name = ",".join([self.original_dir_name, self.work_dir_name] +
+                            [p for p, v in vars(self).items() if v is True])[:150]
+            self._output = open_log_file(name)
         try:
             self._loop_files()
         except:
             raise
         finally:
+            if self._output:
+                self._output.close()
             if self.bar:
                 print(f"{'Affected' if self.execute else 'Affectable'}:"
                       f" {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
@@ -257,18 +272,23 @@ def check(self):
         if not self.work_dir:
             raise AssertionError("Missing work_dir")
         else:
+            self.same_superdir = False
             for a, b in zip(Path(self.work_dir).parts, Path(self.original_dir).parts):
                 if a != b:
                     self.work_dir_name = a
                     self.original_dir_name = b
                     break
             else:
-                self.work_dir_name = a
-                self.original_dir_name = "(same superdir)"
+                self.same_superdir = True
+                self.original_dir_name = self.work_dir_name = a
 
         if self.skip_bigger and not self.media_magic:
             raise AssertionError("The skip_bigger works only with media_magic")
 
+        if self.invert_selection and any((self.replace_with_original, self.treat_bigger_as_original, self.set_both_to_older_date)):
+            raise AssertionError(
+                "It does not make sense using invert_selection with this command. The work file has no file to compare to.")
+
         match self.tolerate_hour:
             case True:
                 self.tolerate_hour = -1, 1
@@ -295,7 +315,8 @@ def check(self):
                 self.checksum and ("crc32", "") or ("", "crc32")))
             print(f"Find files by {used}{f', ignoring: {ignored}' if ignored else ''}")
 
-        which = f"either the file from the work dir at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}' (whichever is bigger)" \
+        dirs_ = "" if self.same_superdir else f" at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}'"
+        which = f"either the file from the work dir{dirs_} (whichever is bigger)" \
             if self.treat_bigger_as_original \
             else f"duplicates from the work dir at '{self.work_dir_name}'"
         small = " (only if smaller than the pair file)" if self.skip_bigger else ""
@@ -321,7 +342,8 @@ def check(self):
 
     def _loop_files(self):
         work_dir, skip = self.work_dir, self.skip
-        work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files")]
+        work_files = [f for f in tqdm((p for p in Path(work_dir).rglob(
+            "*") if not p.is_dir()), desc="Caching working files")]
         if skip:
             if isinstance(work_files, list):
                 work_files = work_files[skip:]
@@ -398,8 +420,10 @@ def _process_file(self, work_file: Path, bar: tqdm):
 
         # original of the work_file has been found
         # one of them might be treated as a duplicate and thus affected
-        if original:
+        if original and not self.invert_selection:
             self._affect(work_file, original)
+        elif not original and self.invert_selection:
+            self._affect(work_file, Path("/dev/null"))
         elif len(candidates) > 1:  # we did not find the object amongst multiple candidates
             self.having_multiple_candidates[work_file] = candidates
             logger.debug("Candidates %s %s", work_file, candidates)
@@ -467,6 +491,9 @@ def _affect(self, work_file: Path, original: Path):
         if (warning and self.log_level <= logging.WARNING) or (self.log_level <= logging.INFO):
             self.bar.clear()  # this looks the same from jupyter and much better from terminal (does not leave a trace of abandoned bars)
             self._print_change(change)
+        if self._output:
+            with redirect_stdout(self._output):
+                self._print_change(change)
 
     def _rename(self, change: Change, affected_file: Path):
         msg = "renamable"
@@ -616,10 +643,24 @@ def print_changes(self):
         [self._print_change(change) for change in self.changes]
 
     def _print_change(self, change: Change):
+        """ We aim for the clearest representation to help the user orientate at a glance.
+        Because file paths can be long, we'll display them as succinctly as possible.
+        Sometimes we'll use, for example, the disk name, other times we'll use file names,
+        or the first or last differing part of the path. """
         wicon, oicon = "🔨", "📄"
         wf, of = change
+
+        # Nice paths
+        wn, on = self.work_dir_name, self.original_dir_name  # meaningful dir representation
+        if self.same_superdir:
+            if wf.name == of.name:  # full path that makes the difference
+                len_ = len(os.path.commonprefix((wf, of)))
+                wn, on = str(wf.parent)[len_:] or "(basedir)", str(of.parent)[len_:] or "(basedir)"
+            else:  # the file name will make the meaningful difference
+                wn, on = wf.name, of.name
+
         print("*", wf)
         print(" ", of)
         [print(text, *(str(s) for s in changes))
-            for text, changes in zip((f"  {wicon}{self.work_dir_name}:",
-                                      f"  {oicon}{self.original_dir_name}:"), change.values()) if len(changes)]
+            for text, changes in zip((f"  {wicon}{wn}:",
+                                      f"  {oicon}{on}:"), change.values()) if len(changes)]
diff --git a/deduplidog/utils.py b/deduplidog/utils.py
@@ -39,6 +39,18 @@ def _qp(path: Path):
     s = str(path)
     return f'"{s}"' if " " in s else s
 
+def open_log_file(name):  # undocumented functions
+    log_file_path = Path(f"{name}.log")
+    try:
+        return log_file_path.open("x")
+    except FileExistsError:
+        counter = 1
+        while True:
+            new_file_path = Path(f"{name} ({counter}).log")
+            try:
+                return new_file_path.open("x")
+            except FileExistsError:
+                counter += 1
 
 def images(urls: Iterable[str | Path]):
     """ Display a ribbon of images. """

diff --git a/tests.py b/tests.py
@@ -74,15 +74,15 @@ def check(self, prefixed: tuple[int] = None, suck: tuple[int] = None):
 class TestDeduplidog(TestCase):
 
     def prepare(self, testing_dir: str = None):
-        self.temp = TemporaryDirectory()
-        temp = Path(testing_dir) if testing_dir else self.temp.name
+        self.temp = mkdtemp()  # TemporaryDirectory() TODO
+        # temp = Path(testing_dir) if testing_dir else self.temp.name TODO
+        temp = str(self.temp)
         originals = Path(temp, "originals")
         work_dir = Path(temp, "work_dir")
         if not testing_dir:
             originals.mkdir()
             work_dir.mkdir()
 
-        # c = FileRepresentationController(temp)
         original_files = {name: FileRepresentation(originals / name).write()
                           for name in (f"file_{i}" for i in range(12))}
         work_files = {name: FileRepresentation(work_dir / name, *rest).write() for name, *rest in (
@@ -107,27 +107,47 @@ def test_simple_prefix(self):
 
     def test_date(self):
         state = self.prepare()
-        Deduplidog(*state, rename=True, execute=True, ignore_date=True)
+        Deduplidog(*state, rename=True, execute=True, ignore_date=True, neglect_warning=True)
         state.check(prefixed=(4, 5, 6, 7, 8, 9, 10, 11))
+        state = self.prepare()
+        Deduplidog(*state, rename=True, execute=True, ignore_date=True)
+        state.check(prefixed=(4, 5, 6, 7, 11))
 
         state = self.prepare()
-        Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=1, neglect_warning=True)
         state.check(prefixed=(4, 7, 8, 9, 11))
+        state = self.prepare()
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
+        state.check(prefixed=(4, 7, 11))
 
         state = self.prepare()
-        Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True)
         state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))
+        state = self.prepare()
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
+        state.check(prefixed=(4, 5, 6, 7, 11))
 
     def test_replace_with_original(self):
         state = self.prepare()
-        Deduplidog(*state, replace_with_original=True, execute=True)
+        Deduplidog(*state, replace_with_original=True, execute=True, neglect_warning=True)
         state.work_files["file_11"].suck(state.originals["file_11"])
         state.check()
 
         state = self.prepare()
-        Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2)
+        Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2, neglect_warning=True)
         state.check(suck=(4, 5, 6, 7, 8, 9, 11))
 
+    def test_invert_selection(self):
+        state = self.prepare()
+        self.assertRaises(AssertionError, Deduplidog,
+                          *state, replace_with_original=True, execute=True, tolerate_hour=2,  invert_selection=True)
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=2,  neglect_warning=True, invert_selection=False)
+        state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))
+
+        state = self.prepare()
+        Deduplidog(*state, rename=True, execute=True, tolerate_hour=2,  neglect_warning=True, invert_selection=True)
+        state.check(prefixed=(1, 2, 10))
+
     #  No media file in the test case.
     # def test_skip_bigger(self):
     #     state = self.prepare()