From a1b125cc02e816d0ffaf8e55efe8d1dc8f8f4407 Mon Sep 17 00:00:00 2001
From: Torben <59419684+entorb@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:54:47 +0200
Subject: [PATCH] ebook: fix ellipsis spacing (#166)

---
 scripts/check_chapters.py    | 54 ++++++++----------------------------
 scripts/ebook/step_6.py      | 26 ++++++++++++++++-
 scripts/ebook/step_6_test.py | 32 +++++++++++++++++++++
 3 files changed, 69 insertions(+), 43 deletions(-)
 create mode 100644 scripts/ebook/step_6_test.py
diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py
index f536a63a..791660a5 100755
--- a/scripts/check_chapters.py
+++ b/scripts/check_chapters.py
@@ -140,7 +140,7 @@ def fix_line(s: str) -> str:
     # simple and safe
     s = fix_spaces(s)
     s = fix_latex(s)
-    s = fix_dots(s)
+    s = fix_ellipsis(s)
     s = fix_MrMrs(s)
     s = fix_numbers(s)
     s = fix_common_typos(s)
@@ -214,53 +214,23 @@ def fix_latex(s: str) -> str:
 assert fix_latex("no new line after \\\\") == "no new line after \\\\"
 
 
-def fix_dots(s: str) -> str:
+def fix_ellipsis(s: str) -> str:
+    """Fix spaces around ellipsis."""
     # ... -> …
     s = s.replace("...", "…")
-    # ... with spaces around
-    s = s.replace(" … ", "…")
-    # NOT '… ' as in ', no… “I'
-    # s = re.sub(r" *… *", r"…", s)
-    # … at start of line
-    s = re.sub(r"^ *… *", r"…", s)
-    # … at end of line
-    s = re.sub(r" *… *$", r"…", s)
-    # before comma
-    s = s.replace(" …,", "…,")
-
-    if settings["lang"] == "EN":
-        # … at end of quotation ' …"' -> '…"'
-        s = s.replace(" …”", "…”")
-        # "… " but not before “
-        s = re.sub(r"… (?!“)", r"…", s)
-        # " …" but after punctuation
-        s = re.sub(r"(?<!(%|,|\.|!|\?|:)) …", r"…", s)  # … at start of line
-    if settings["lang"] == "DE":
-        # … at end of quotation ' …"' -> '…"'
-        s = s.replace(" …“", "…“")
-        # "… " but not before „
-        s = re.sub(r"… (?!„)", r"…", s)
-        # " …" but after punctuation
-        s = re.sub(r"(?<!(%|,|\.|!|\?|:)) …", r"…", s)
-    #  keep space between . or , and …
-    s = re.sub(r"([,\.!\?])…", r"\1 …", s)
+    # remove all spaces around ellipsis
+    s = re.sub(r" *… *", r"…", s)
 
+    # after punctuation: add space
+    s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
     return s
 
 
-assert fix_dots("bad...dots") == "bad…dots"
-assert fix_dots("bad … dots") == "bad…dots"
-assert fix_dots("bad.…dots") == "bad. …dots"
-assert fix_dots("bad. …dots") == "bad. …dots"
-assert fix_dots("bad, …dots") == "bad, …dots"
-assert fix_dots("bad! …dots") == "bad! …dots"
-assert fix_dots("bad? …dots") == "bad? …dots"
-assert fix_dots(" … dots") == "…dots"
-assert fix_dots("some … ") == "some…"
-
-if settings["lang"] == "DE":
-    assert fix_dots("bad… dots") == "bad…dots"
-    assert fix_dots("bad… „dots") == "bad… „dots"
+assert fix_ellipsis("foo...bar") == "foo…bar"
+assert fix_ellipsis("foo … bar") == "foo…bar"
+assert fix_ellipsis("foo… bar") == "foo…bar"
+assert fix_ellipsis("foo …bar") == "foo…bar"
+assert fix_ellipsis("foo, …") == "foo, …"
 
 
 def fix_MrMrs(s: str) -> str:  # noqa: N802
diff --git a/scripts/ebook/step_6.py b/scripts/ebook/step_6.py
index 6f1edcae..7a6b06d6 100755
--- a/scripts/ebook/step_6.py
+++ b/scripts/ebook/step_6.py
@@ -16,6 +16,28 @@
 target_file = Path("hpmor.html")
 
 
+def fix_ellipsis(s: str) -> str:
+    """
+    Fix ellipsis spacing for ebooks.
+    """
+    # 1. remove all spaces around ellipsis
+    s = re.sub(r" *… *", "…", s)
+    # 2. recreate some spaces
+    # before punctuation: no space wanted, already handled by step 1
+    # between words
+    s = re.sub(r"(?<=[\w])…(?=[\w])", "… ", s)
+    # after punctuation: add space
+    s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
+    # fine-tuning: add a space after "</em>…" and between "…" and "<em>"
+    s = re.sub(r"(?<=</em>)…", "… ", s)
+    s = re.sub(r"…(?=<em>)", "… ", s)
+    # before opening EN-quotes: add space
+    s = re.sub(r"…(?=[“])", "… ", s)
+    # before opening DE-quotes: add space
+    # s = re.sub(r"…(?=[„])", "… ", s)
+    return s
+
+
 if __name__ == "__main__":
     print("=== 6. HTML modifications ===")
 
@@ -61,13 +83,15 @@
     # remove training slashes to satisfy https://validator.w3.org
     cont = cont.replace("<br />", "<br>")
     cont = cont.replace("<hr />", "<hr>")
-
     cont = re.sub(
         r"(<meta [^>]*) />",
         r"\1>",
         cont,
     )
 
+    # fix spaces around ellipsis
+    cont = fix_ellipsis(cont)
+
     # remove bad span ids (containing spaces) from newspaper spans
     cont = re.sub(r'<span id="[^"]+" label="[^"]+">', r"<span>", cont, count=5)
 
diff --git a/scripts/ebook/step_6_test.py b/scripts/ebook/step_6_test.py
new file mode 100644
index 00000000..19f2adc5
--- /dev/null
+++ b/scripts/ebook/step_6_test.py
@@ -0,0 +1,32 @@
+"""Unit Tests."""  # noqa: INP001
+# ruff: noqa: S101
+
+from step_6 import fix_ellipsis
+
+test_cases = [
+    # quotations
+    ("foo…”", "foo…”"),
+    ("“…foo", "“…foo"),
+    # html
+    ("foo…</p>", "foo…</p>"),
+    ("<p>…foo", "<p>…foo"),
+    # between 2 words
+    ("foo…bar", "foo… bar"),
+    ("foo …bar", "foo… bar"),
+    ("foo … bar", "foo… bar"),
+    ("foo… bar", "foo… bar"),
+    # start of sentence
+    ("foo.…bar", "foo. …bar"),
+    ("foo!…bar", "foo! …bar"),
+    ("foo?…bar", "foo? …bar"),
+    # end of sentence
+    ("foo…. bar", "foo…. bar"),
+    ("foo…! bar", "foo…! bar"),
+    ("foo…? bar", "foo…? bar"),
+    # emph
+    ("foo</em>…bar", "foo</em>… bar"),
+    ("foo…<em>bar", "foo… <em>bar"),
+]
+
+for inp, exp in test_cases:
+    assert fix_ellipsis(inp) == exp, fix_ellipsis(inp)