From a1b125cc02e816d0ffaf8e55efe8d1dc8f8f4407 Mon Sep 17 00:00:00 2001
From: Torben <59419684+entorb@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:54:47 +0200
Subject: [PATCH] ebook: fix ellipsis spacing (#166)
---
scripts/check_chapters.py | 54 ++++++++----------------------------
scripts/ebook/step_6.py | 26 ++++++++++++++++-
scripts/ebook/step_6_test.py | 32 +++++++++++++++++++++
3 files changed, 69 insertions(+), 43 deletions(-)
create mode 100644 scripts/ebook/step_6_test.py
diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py
index f536a63a..791660a5 100755
--- a/scripts/check_chapters.py
+++ b/scripts/check_chapters.py
@@ -140,7 +140,7 @@ def fix_line(s: str) -> str:
# simple and safe
s = fix_spaces(s)
s = fix_latex(s)
- s = fix_dots(s)
+ s = fix_ellipsis(s)
s = fix_MrMrs(s)
s = fix_numbers(s)
s = fix_common_typos(s)
@@ -214,53 +214,23 @@ def fix_latex(s: str) -> str:
assert fix_latex("no new line after \\\\") == "no new line after \\\\"
-def fix_dots(s: str) -> str:
+def fix_ellipsis(s: str) -> str:
+ """Fix spaces around ellipsis."""
# ... -> …
s = s.replace("...", "…")
- # ... with spaces around
- s = s.replace(" … ", "…")
- # NOT '… ' as in ', no… “I'
- # s = re.sub(r" *… *", r"…", s)
- # … at start of line
- s = re.sub(r"^ *… *", r"…", s)
- # … at end of line
- s = re.sub(r" *… *$", r"…", s)
- # before comma
- s = s.replace(" …,", "…,")
-
- if settings["lang"] == "EN":
- # … at end of quotation ' …"' -> '…"'
- s = s.replace(" …”", "…”")
- # "… " but not before “
- s = re.sub(r"… (?!“)", r"…", s)
- # " …" but after punctuation
- s = re.sub(r"(? '…"'
- s = s.replace(" …“", "…“")
- # "… " but not before „
- s = re.sub(r"… (?!„)", r"…", s)
- # " …" but after punctuation
- s = re.sub(r"(? str: # noqa: N802
diff --git a/scripts/ebook/step_6.py b/scripts/ebook/step_6.py
index 6f1edcae..7a6b06d6 100755
--- a/scripts/ebook/step_6.py
+++ b/scripts/ebook/step_6.py
@@ -16,6 +16,28 @@
target_file = Path("hpmor.html")
+def fix_ellipsis(s: str) -> str:
+ """
+ Fix ellipsis spacing for ebooks.
+ """
+ # 1. remove all spaces around ellipsis
+ s = re.sub(r" *… *", "…", s)
+ # 2. recreate some spaces
+ # before punctuation : no space, so governed by 1.
+ # between words
+ s = re.sub(r"(?<=[\w])…(?=[\w])", "… ", s)
+ # after punctuation: add space
+ s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
+ # fine-tuning </em>… and …<em>
+ s = re.sub(r"(?<=</em>)…", "… ", s)
+ s = re.sub(r"…(?=<em>)", "… ", s)
+ # before opening EN-quotes: add space
+ s = re.sub(r"…(?=[“])", "… ", s)
+ # before opening DE-quotes: add space
+ # s = re.sub(r"…(?=[„])", "… ", s)
+ return s
+
+
if __name__ == "__main__":
print("=== 6. HTML modifications ===")
@@ -61,13 +83,15 @@
# remove training slashes to satisfy https://validator.w3.org
cont = cont.replace("
", "
")
cont = cont.replace("
", "
")
-
cont = re.sub(
r"(]*) />",
r"\1>",
cont,
)
+ # fix spaces around ellipsis
+ cont = fix_ellipsis(cont)
+
# remove bad span ids (containing spaces) from newspaper spans
cont = re.sub(r'', r"", cont, count=5)
diff --git a/scripts/ebook/step_6_test.py b/scripts/ebook/step_6_test.py
new file mode 100644
index 00000000..19f2adc5
--- /dev/null
+++ b/scripts/ebook/step_6_test.py
@@ -0,0 +1,32 @@
+"""Unit Tests.""" # noqa: INP001
+# ruff: noqa: S101
+
+from step_6 import fix_ellipsis
+
+test_cases = [
+ # quotations
+ ("foo…”", "foo…”"),
+ ("“…foo", "“…foo"),
+ # html
+ ("foo…</p>", "foo…</p>"),
+ ("<p>…foo", "<p>…foo"),
+ # between 2 words
+ ("foo…bar", "foo… bar"),
+ ("foo …bar", "foo… bar"),
+ ("foo … bar", "foo… bar"),
+ ("foo… bar", "foo… bar"),
+ # start of sentence
+ ("foo.…bar", "foo. …bar"),
+ ("foo!…bar", "foo! …bar"),
+ ("foo?…bar", "foo? …bar"),
+ # end of sentence
+ ("foo…. bar", "foo…. bar"),
+ ("foo…! bar", "foo…! bar"),
+ ("foo…? bar", "foo…? bar"),
+ # emph
+ ("foo…<em>bar</em>", "foo… <em>bar</em>"),
+ ("<em>foo</em>…bar", "<em>foo</em>… bar"),
+]
+
+for inp, exp in test_cases:
+ assert fix_ellipsis(inp) == exp, fix_ellipsis(inp)