From a1b125cc02e816d0ffaf8e55efe8d1dc8f8f4407 Mon Sep 17 00:00:00 2001 From: Torben <59419684+entorb@users.noreply.github.com> Date: Sat, 27 Apr 2024 14:54:47 +0200 Subject: [PATCH] ebook: fix ellipsis spacing (#166) --- scripts/check_chapters.py | 54 ++++++++---------------------------- scripts/ebook/step_6.py | 26 ++++++++++++++++- scripts/ebook/step_6_test.py | 32 +++++++++++++++++++++ 3 files changed, 69 insertions(+), 43 deletions(-) create mode 100644 scripts/ebook/step_6_test.py diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py index f536a63a..791660a5 100755 --- a/scripts/check_chapters.py +++ b/scripts/check_chapters.py @@ -140,7 +140,7 @@ def fix_line(s: str) -> str: # simple and safe s = fix_spaces(s) s = fix_latex(s) - s = fix_dots(s) + s = fix_ellipsis(s) s = fix_MrMrs(s) s = fix_numbers(s) s = fix_common_typos(s) @@ -214,53 +214,23 @@ def fix_latex(s: str) -> str: assert fix_latex("no new line after \\\\") == "no new line after \\\\" -def fix_dots(s: str) -> str: +def fix_ellipsis(s: str) -> str: + """Fix spaces around ellipsis.""" # ... -> … s = s.replace("...", "…") - # ... with spaces around - s = s.replace(" … ", "…") - # NOT '… ' as in ', no… “I' - # s = re.sub(r" *… *", r"…", s) - # … at start of line - s = re.sub(r"^ *… *", r"…", s) - # … at end of line - s = re.sub(r" *… *$", r"…", s) - # before comma - s = s.replace(" …,", "…,") - - if settings["lang"] == "EN": - # … at end of quotation ' …"' -> '…"' - s = s.replace(" …”", "…”") - # "… " but not before “ - s = re.sub(r"… (?!“)", r"…", s) - # " …" but after punctuation - s = re.sub(r"(? '…"' - s = s.replace(" …“", "…“") - # "… " but not before „ - s = re.sub(r"… (?!„)", r"…", s) - # " …" but after punctuation - s = re.sub(r"(? str: # noqa: N802 diff --git a/scripts/ebook/step_6.py b/scripts/ebook/step_6.py index 6f1edcae..7a6b06d6 100755 --- a/scripts/ebook/step_6.py +++ b/scripts/ebook/step_6.py @@ -16,6 +16,28 @@ target_file = Path("hpmor.html") +def fix_ellipsis(s: str) -> str: + """ + Fix ellipsis spacing for ebooks. + """ + # 1. remove all spaces around ellipsis + s = re.sub(r" *… *", "…", s) + # 2. recreate some spaces + # before punctuation : no space, so governed by 1. + # between words + s = re.sub(r"(?<=[\w])…(?=[\w])", "… ", s) + # after punctuation: add space + s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s) + # fine-tuning … and … + s = re.sub(r"(?<=)…", "… ", s) + s = re.sub(r"…(?=)", "… ", s) + # before opening EN-quotes: add space + s = re.sub(r"…(?=[“])", "… ", s) + # before opening DE-quotes: add space + # s = re.sub(r"…(?=[„])", "… ", s) + return s + + if __name__ == "__main__": print("=== 6. HTML modifications ===") @@ -61,13 +83,15 @@ # remove training slashes to satisfy https://validator.w3.org cont = cont.replace("
", "
") cont = cont.replace("
", "
") - cont = re.sub( r"(]*) />", r"\1>", cont, ) + # fix spaces around ellipsis + cont = fix_ellipsis(cont) + # remove bad span ids (containing spaces) from newspaper spans cont = re.sub(r'', r"", cont, count=5) diff --git a/scripts/ebook/step_6_test.py b/scripts/ebook/step_6_test.py new file mode 100644 index 00000000..19f2adc5 --- /dev/null +++ b/scripts/ebook/step_6_test.py @@ -0,0 +1,32 @@ +"""Unit Tests.""" # noqa: INP001 +# ruff: noqa: S101 + +from step_6 import fix_ellipsis + +test_cases = [ + # quotations + ("foo…”", "foo…”"), + ("“…foo", "“…foo"), + # html + ("foo…

", "foo…

"), + ("

…foo", "

…foo"), + # between 2 words + ("foo…bar", "foo… bar"), + ("foo …bar", "foo… bar"), + ("foo … bar", "foo… bar"), + ("foo… bar", "foo… bar"), + # start of sentence + ("foo.…bar", "foo. …bar"), + ("foo!…bar", "foo! …bar"), + ("foo?…bar", "foo? …bar"), + # end of sentence + ("foo…. bar", "foo…. bar"), + ("foo…! bar", "foo…! bar"), + ("foo…? bar", "foo…? bar"), + # emph + ("foo…bar", "foo… bar"), + ("foo…bar", "foo… bar"), +] + +for inp, exp in test_cases: + assert fix_ellipsis(inp) == exp, fix_ellipsis(inp)