Skip to content

Commit

Permalink
invert, output
Browse files Browse the repository at this point in the history
  • Loading branch information
e3rd committed Mar 14, 2024
1 parent 27edb29 commit e0620f9
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 23 deletions.
18 changes: 12 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,26 @@ from deduplidog import Deduplidog
Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True)
```

This command produced the following output:

```
Find files by size, ignoring: date, crc32
Duplicates from the work dir at 'home' would be (if execute were True) renamed (prefixed with ✓).
Number of originals: 38
* /home/user/duplicates/foo.txt
/media/disk/origs/foo.txt
🔨home: renamable
📄media: DATE WARNING + a day
Affectable: 38/38
Affected size: 59.9 kB
📄media: DATE WARNING + a day 🛟skipped on warning
Affectable: 37/38
Affected size: 56.9 kB
Warnings: 1
```

We found out all the files in the *duplicates* folder seem to be useless but one. Its date is earlier than the original one. Let's see the full log.
We found out all the files in the *duplicates* folder seem to be useless but one. Its date is earlier than the original one. The life buoy icon would prevent any action. To suppress this, let's turn on `set_both_to_older_date`. Let's see the full log.

```python3
Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
Deduplidog("/home/user/duplicates", "/media/disk/origs",
ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
```

```
Expand All @@ -94,7 +97,8 @@ Affected size: 59.9 kB
As you see, the log is in the briefest, yet transparent form. The files to be affected in the work folder are prefixed with the 🔨 icon, whereas those affected in the original folder use the 📄 icon. We might add the `execute=True` parameter to perform the actions, or use `bashify=True` to inspect them.

```python3
Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
Deduplidog("/home/user/duplicates", "/media/disk/origs",
ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
```

The `bashify=True` just produces the commands we might use.
Expand Down Expand Up @@ -146,13 +150,15 @@ Find the duplicates. Normally, the file must have the same size, date and name.
| strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS" |
| strip_suffix | str | False | When comparing files in work_dir, strip the file name end matched by a regular expression. Ex: "001-edited.jpg" is compared as "001.jpg" |
| work_file_stem_shortened | int | None | Photos downloaded from Google have their stem shortened to 47 chars. For comparison purposes, treat the file names in the original folder as shortened too. |
| invert_selection | bool | False | Match only those files from work_dir that do not match the criteria. |
| **Media** |
| media_magic | bool | False | Neither the size nor the date is compared for files with media suffixes.<br>A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.<br>An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.<br>(This mode is considerably slower.) |
| accepted_frame_delta | int | 1 | Used only when media_magic is True |
| accepted_img_hash_diff | int | 1 | Used only when media_magic is True |
| img_compare_date | bool | False | If True and `media_magic=True`, the work file date or the work file EXIF date must match the original file date (has to be no more than an hour around). |
| **Helper** |
| log_level | int | 30 (warning) | 10 debug .. 50 critical |
| output | bool | False | Stores the output log to a file in the current working directory. (Never overwrites an older file.) |

## Utils
In the `deduplidog.utils` package, you'll find several handy tools to help you. You can discover their parameters through your IDE hints.
Expand Down
59 changes: 50 additions & 9 deletions deduplidog/deduplidog.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from contextlib import redirect_stdout
import logging
import os
import re
Expand All @@ -19,7 +20,7 @@
from tqdm.autonotebook import tqdm

from .helpers import Field, FileMetadata, keydefaultdict
from .utils import _qp, crc, get_frame_count
from .utils import _qp, crc, get_frame_count, open_log_file

VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv", ".hevc"
IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif", ".avif", ".webp", ".heic", ".avif"
Expand Down Expand Up @@ -122,6 +123,8 @@ class Deduplidog:
"""When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" """, False)] = False
work_file_stem_shortened: Annotated[int, opt(
"Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened.", None)] = None
invert_selection: Annotated[bool, flag(
"Match only those files from work_dir that does not match the criterions.")] = False

# Media section
media_magic: Annotated[bool, flag(
Expand All @@ -139,8 +142,10 @@ class Deduplidog:

# Helper section
log_level: Annotated[int, opt("10 debug .. 50 critical", logging.WARNING, 1)] = logging.WARNING
output: Annotated[bool, flag(
"Stores the output log to a file in the current working directory. (Never overwrites an older file.)")] = False

# TODO output of log and of bashize should be outputtable to a file
# TODO bashize should be outputtable through output

# Following parameters are undocumented:

Expand Down Expand Up @@ -193,6 +198,10 @@ def __post_init__(self):
" TODO deprecated"
self.original_dir_name = self.work_dir_name = None
"Shortened name, human readable"
self.same_superdir = False
""" Work_dir and original dir is the same """
self._output = None
" Log buffer "

self.check()
self.perform()
Expand Down Expand Up @@ -221,11 +230,17 @@ def perform(self):
self._common_prefix_length = len(os.path.commonprefix([self.original_dir, self.work_dir])) \
if self.shorter_log else 0

if self.output:
name = ",".join([self.original_dir_name, self.work_dir_name] +
[p for p, v in vars(self).items() if v is True])[:150]
self._output = open_log_file(name)
try:
self._loop_files()
except:
raise
finally:
if self._output:
self._output.close()
if self.bar:
print(f"{'Affected' if self.execute else 'Affectable'}:"
f" {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
Expand Down Expand Up @@ -257,18 +272,23 @@ def check(self):
if not self.work_dir:
raise AssertionError("Missing work_dir")
else:
self.same_superdir = False
for a, b in zip(Path(self.work_dir).parts, Path(self.original_dir).parts):
if a != b:
self.work_dir_name = a
self.original_dir_name = b
break
else:
self.work_dir_name = a
self.original_dir_name = "(same superdir)"
self.same_superdir = True
self.original_dir_name = self.work_dir_name = a

if self.skip_bigger and not self.media_magic:
raise AssertionError("The skip_bigger works only with media_magic")

if self.invert_selection and any((self.replace_with_original, self.treat_bigger_as_original, self.set_both_to_older_date)):
raise AssertionError(
"It does not make sense using invert_selection with this command. The work file has no file to compare to.")

match self.tolerate_hour:
case True:
self.tolerate_hour = -1, 1
Expand All @@ -295,7 +315,8 @@ def check(self):
self.checksum and ("crc32", "") or ("", "crc32")))
print(f"Find files by {used}{f', ignoring: {ignored}' if ignored else ''}")

which = f"either the file from the work dir at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}' (whichever is bigger)" \
dirs_ = "" if self.same_superdir else f" at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}'"
which = f"either the file from the work dir{dirs_} (whichever is bigger)" \
if self.treat_bigger_as_original \
else f"duplicates from the work dir at '{self.work_dir_name}'"
small = " (only if smaller than the pair file)" if self.skip_bigger else ""
Expand All @@ -321,7 +342,8 @@ def check(self):

def _loop_files(self):
work_dir, skip = self.work_dir, self.skip
work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files")]
work_files = [f for f in tqdm((p for p in Path(work_dir).rglob(
"*") if not p.is_dir()), desc="Caching working files")]
if skip:
if isinstance(work_files, list):
work_files = work_files[skip:]
Expand Down Expand Up @@ -398,8 +420,10 @@ def _process_file(self, work_file: Path, bar: tqdm):

# original of the work_file has been found
# one of them might be treated as a duplicate and thus affected
if original:
if original and not self.invert_selection:
self._affect(work_file, original)
elif not original and self.invert_selection:
self._affect(work_file, Path("/dev/null"))
elif len(candidates) > 1: # we did not find the object amongst multiple candidates
self.having_multiple_candidates[work_file] = candidates
logger.debug("Candidates %s %s", work_file, candidates)
Expand Down Expand Up @@ -467,6 +491,9 @@ def _affect(self, work_file: Path, original: Path):
if (warning and self.log_level <= logging.WARNING) or (self.log_level <= logging.INFO):
self.bar.clear() # this looks the same from jupyter and much better from terminal (does not leave a trace of abandoned bars)
self._print_change(change)
if self._output:
with redirect_stdout(self._output):
self._print_change(change)

def _rename(self, change: Change, affected_file: Path):
msg = "renamable"
Expand Down Expand Up @@ -616,10 +643,24 @@ def print_changes(self):
[self._print_change(change) for change in self.changes]

def _print_change(self, change: Change):
""" We aim for the clearest representation to help the user orientate at a glance.
Because file paths can be long, we'll display them as succinctly as possible.
Sometimes we'll use, for example, the disk name, other times we'll use file names,
or the first or last differing part of the path. """
wicon, oicon = "🔨", "📄"
wf, of = change

# Nice paths
wn, on = self.work_dir_name, self.original_dir_name # meaningful dir representation
if self.same_superdir:
if wf.name == of.name: # full path that makes the difference
len_ = len(os.path.commonprefix((wf, of)))
wn, on = str(wf.parent)[len_:] or "(basedir)", str(of.parent)[len_:] or "(basedir)"
else: # the file name will make the meaningful difference
wn, on = wf.name, of.name

print("*", wf)
print(" ", of)
[print(text, *(str(s) for s in changes))
for text, changes in zip((f" {wicon}{self.work_dir_name}:",
f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)]
for text, changes in zip((f" {wicon}{wn}:",
f" {oicon}{on}:"), change.values()) if len(changes)]
12 changes: 12 additions & 0 deletions deduplidog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ def _qp(path: Path):
s = str(path)
return f'"{s}"' if " " in s else s

def open_log_file(name):  # undocumented functions
    """Create a new log file named after *name* and return it opened for writing.

    The first candidate is ``"{name}.log"``. If that file already exists,
    ``"{name} (1).log"``, ``"{name} (2).log"``, ... are tried in turn until a
    free name is found, so an older log is never overwritten.

    The file is opened in exclusive-creation mode (``"x"``), which makes the
    "does it exist?" check and the creation a single atomic step.

    The encoding is pinned to UTF-8 because the logged text contains emoji
    icons and arbitrary file names; relying on the platform default encoding
    would raise ``UnicodeEncodeError`` on non-UTF-8 locales.

    :param name: Path of the log file without the ``.log`` suffix.
    :return: The open text file object. The caller is responsible for closing it.
    """
    candidate = Path(f"{name}.log")
    counter = 0
    while True:
        try:
            return candidate.open("x", encoding="utf-8")
        except FileExistsError:
            # Name is taken – move on to the next numbered variant.
            counter += 1
            candidate = Path(f"{name} ({counter}).log")

def images(urls: Iterable[str | Path]):
""" Display a ribbon of images. """
Expand Down
36 changes: 28 additions & 8 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ def check(self, prefixed: tuple[int] = None, suck: tuple[int] = None):
class TestDeduplidog(TestCase):

def prepare(self, testing_dir: str = None):
self.temp = TemporaryDirectory()
temp = Path(testing_dir) if testing_dir else self.temp.name
self.temp = mkdtemp() # TemporaryDirectory() TODO
# temp = Path(testing_dir) if testing_dir else self.temp.name TODO
temp = str(self.temp)
originals = Path(temp, "originals")
work_dir = Path(temp, "work_dir")
if not testing_dir:
originals.mkdir()
work_dir.mkdir()

# c = FileRepresentationController(temp)
original_files = {name: FileRepresentation(originals / name).write()
for name in (f"file_{i}" for i in range(12))}
work_files = {name: FileRepresentation(work_dir / name, *rest).write() for name, *rest in (
Expand All @@ -107,27 +107,47 @@ def test_simple_prefix(self):

def test_date(self):
state = self.prepare()
Deduplidog(*state, rename=True, execute=True, ignore_date=True)
Deduplidog(*state, rename=True, execute=True, ignore_date=True, neglect_warning=True)
state.check(prefixed=(4, 5, 6, 7, 8, 9, 10, 11))
state = self.prepare()
Deduplidog(*state, rename=True, execute=True, ignore_date=True)
state.check(prefixed=(4, 5, 6, 7, 11))

state = self.prepare()
Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
Deduplidog(*state, rename=True, execute=True, tolerate_hour=1, neglect_warning=True)
state.check(prefixed=(4, 7, 8, 9, 11))
state = self.prepare()
Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
state.check(prefixed=(4, 7, 11))

state = self.prepare()
Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True)
state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))
state = self.prepare()
Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
state.check(prefixed=(4, 5, 6, 7, 11))

def test_replace_with_original(self):
state = self.prepare()
Deduplidog(*state, replace_with_original=True, execute=True)
Deduplidog(*state, replace_with_original=True, execute=True, neglect_warning=True)
state.work_files["file_11"].suck(state.originals["file_11"])
state.check()

state = self.prepare()
Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2)
Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2, neglect_warning=True)
state.check(suck=(4, 5, 6, 7, 8, 9, 11))

def test_invert_selection(self):
# Exercise the `invert_selection` flag in three scenarios.
state = self.prepare()
# 1) Combining invert_selection with replace_with_original must be rejected
#    with an AssertionError.
self.assertRaises(AssertionError, Deduplidog,
*state, replace_with_original=True, execute=True, tolerate_hour=2, invert_selection=True)
# 2) Baseline with invert_selection=False: these work files are expected to get prefixed.
Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=False)
state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))

# 3) Fresh fixture, same options but invert_selection=True: a different set of
#    work files (1, 2, 10) is expected to get prefixed.
state = self.prepare()
Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=True)
state.check(prefixed=(1, 2, 10))

# No media file in the test case.
# def test_skip_bigger(self):
# state = self.prepare()
Expand Down

0 comments on commit e0620f9

Please sign in to comment.