Skip to content

Commit

Permalink
ebook: fix ellipsis spacing (#166)
Browse files Browse the repository at this point in the history
  • Loading branch information
entorb authored Apr 27, 2024
1 parent a5533c2 commit a1b125c
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 43 deletions.
54 changes: 12 additions & 42 deletions scripts/check_chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def fix_line(s: str) -> str:
# simple and safe
s = fix_spaces(s)
s = fix_latex(s)
s = fix_dots(s)
s = fix_ellipsis(s)
s = fix_MrMrs(s)
s = fix_numbers(s)
s = fix_common_typos(s)
Expand Down Expand Up @@ -214,53 +214,23 @@ def fix_latex(s: str) -> str:
assert fix_latex("no new line after \\\\") == "no new line after \\\\"


def fix_dots(s: str) -> str:
def fix_ellipsis(s: str) -> str:
"""Fix spaces around ellipsis."""
# ... -> …
s = s.replace("...", "…")
# ... with spaces around
s = s.replace(" … ", "…")
# NOT '… ' as in ', no… “I'
# s = re.sub(r" *… *", r"…", s)
# … at start of line
s = re.sub(r"^ *… *", r"…", s)
# … at end of line
s = re.sub(r" *… *$", r"…", s)
# before comma
s = s.replace(" …,", "…,")

if settings["lang"] == "EN":
# … at end of quotation ' …"' -> '…"'
s = s.replace(" …”", "…”")
# "… " but not before “
s = re.sub(r"… (?!“)", r"…", s)
# " …" but after punctuation
s = re.sub(r"(?<!(%|,|\.|!|\?|:)) …", r"…", s) # … at start of line
if settings["lang"] == "DE":
# … at end of quotation ' …"' -> '…"'
s = s.replace(" …“", "…“")
# "… " but not before „
s = re.sub(r"… (?!„)", r"…", s)
# " …" but after punctuation
s = re.sub(r"(?<!(%|,|\.|!|\?|:)) …", r"…", s)
# keep space between . or , and …
s = re.sub(r"([,\.!\?])…", r"\1 …", s)
# remove all spaces around ellipsis
s = re.sub(r" *… *", r"…", s)

# after punctuation: add space
s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
return s


assert fix_dots("bad...dots") == "bad…dots"
assert fix_dots("bad … dots") == "bad…dots"
assert fix_dots("bad.…dots") == "bad. …dots"
assert fix_dots("bad. …dots") == "bad. …dots"
assert fix_dots("bad, …dots") == "bad, …dots"
assert fix_dots("bad! …dots") == "bad! …dots"
assert fix_dots("bad? …dots") == "bad? …dots"
assert fix_dots(" … dots") == "…dots"
assert fix_dots("some … ") == "some…"

if settings["lang"] == "DE":
assert fix_dots("bad… dots") == "bad…dots"
assert fix_dots("bad… „dots") == "bad… „dots"
assert fix_ellipsis("foo...bar") == "foo…bar"
assert fix_ellipsis("foo … bar") == "foo…bar"
assert fix_ellipsis("foo… bar") == "foo…bar"
assert fix_ellipsis("foo …bar") == "foo…bar"
assert fix_ellipsis("foo, …") == "foo, …"


def fix_MrMrs(s: str) -> str: # noqa: N802
Expand Down
26 changes: 25 additions & 1 deletion scripts/ebook/step_6.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,28 @@
target_file = Path("hpmor.html")


def fix_ellipsis(s: str) -> str:
    """
    Normalize the spacing around ellipsis characters (…) for ebooks.

    All spaces around every ellipsis are removed first; a single space is
    then re-created only where it is wanted:

    * between two words: after the ellipsis ("foo…bar" -> "foo… bar")
    * after punctuation: before the ellipsis ("foo.…bar" -> "foo. …bar")
    * after ``</em>`` and before ``<em>``: after the ellipsis
    * before an opening English quote (“): after the ellipsis

    An ellipsis that is directly followed by punctuation, a closing
    English quote (”) or a closing tag never gets a trailing space.

    Args:
        s: Text or HTML fragment to clean up.

    Returns:
        The input with normalized ellipsis spacing.
    """
    # 1. remove all spaces around ellipsis
    s = re.sub(r" *… *", "…", s)

    # 2. recreate some spaces
    # before punctuation: no space, so governed by 1.
    # between words: space after the ellipsis
    s = re.sub(r"(?<=[\w])…(?=[\w])", "… ", s)
    # after punctuation: space before the ellipsis
    s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
    # fine-tuning </em>… : space after the ellipsis, except when it is
    # followed by punctuation, a closing quote or a closing tag; otherwise
    # "</em>…." would become "</em>… ." and break the no-space rule above
    s = re.sub(r"(?<=</em>)…(?![\.\?!:,;”]|</)", "… ", s)
    # fine-tuning …<em> : space after the ellipsis
    s = re.sub(r"…(?=<em>)", "… ", s)
    # before opening EN-quotes: add space
    s = re.sub(r"…(?=[“])", "… ", s)
    # NOTE: the equivalent rule for opening DE-quotes („) is disabled:
    # s = re.sub(r"…(?=[„])", "… ", s)
    return s


if __name__ == "__main__":
print("=== 6. HTML modifications ===")

Expand Down Expand Up @@ -61,13 +83,15 @@
# remove trailing slashes to satisfy https://validator.w3.org
cont = cont.replace("<br />", "<br>")
cont = cont.replace("<hr />", "<hr>")

cont = re.sub(
r"(<meta [^>]*) />",
r"\1>",
cont,
)

# fix spaces around ellipsis
cont = fix_ellipsis(cont)

# remove bad span ids (containing spaces) from newspaper spans
cont = re.sub(r'<span id="[^"]+" label="[^"]+">', r"<span>", cont, count=5)

Expand Down
32 changes: 32 additions & 0 deletions scripts/ebook/step_6_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Unit Tests.""" # noqa: INP001
# ruff: noqa: S101

from step_6 import fix_ellipsis

# (input, expected output) pairs for fix_ellipsis, grouped by the context
# in which the ellipsis appears
test_cases = [
    # next to quotation marks
    ("foo…”", "foo…”"),
    ("“…foo", "“…foo"),
    # next to HTML tags
    ("foo…</p>", "foo…</p>"),
    ("<p>…foo", "<p>…foo"),
    # between two words
    ("foo…bar", "foo… bar"),
    ("foo …bar", "foo… bar"),
    ("foo … bar", "foo… bar"),
    ("foo… bar", "foo… bar"),
    # ellipsis opening a sentence (directly after punctuation)
    ("foo.…bar", "foo. …bar"),
    ("foo!…bar", "foo! …bar"),
    ("foo?…bar", "foo? …bar"),
    # ellipsis closing a sentence (directly before punctuation)
    ("foo…. bar", "foo…. bar"),
    ("foo…! bar", "foo…! bar"),
    ("foo…? bar", "foo…? bar"),
    # next to emphasis tags
    ("foo</em>…bar", "foo</em>… bar"),
    ("foo…<em>bar", "foo… <em>bar"),
]

# run every case; on failure the actual result is shown as the assert message
for given, wanted in test_cases:
    actual = fix_ellipsis(given)
    assert actual == wanted, actual

0 comments on commit a1b125c

Please sign in to comment.