From 670de4558497d173abc4dcde2070963fd83e5169 Mon Sep 17 00:00:00 2001 From: Basti Tee Date: Mon, 21 Aug 2023 17:09:03 +0200 Subject: [PATCH] Improve quote handling for highlights with notes --- CHANGELOG.md | 1 + tests/test_tolino_note.py | 2 +- tolino_notes/tolino_note.py | 29 +++++++++++++++++------------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49f84e1..98b4dff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 0.2.0 +- Improve quote handling of highlights with notes - Extracted notes writer to separate module - Improved type-safety - Improve VSCode development environment diff --git a/tests/test_tolino_note.py b/tests/test_tolino_note.py index 9af17ce..0ff5be7 100644 --- a/tests/test_tolino_note.py +++ b/tests/test_tolino_note.py @@ -140,7 +140,7 @@ def test_lang_de_noted_note_2(self) -> None: # noqa: D102 assert note.content.endswith('Alas."') assert note.user_notes assert note.user_notes.startswith('Let\'s make a long multi') - assert note.user_notes.endswith('quotes like "this') + assert note.user_notes.endswith('quotes like "this"') def test_lang_de_noted_note_3(self) -> None: # noqa: D102 note = TolinoNote.from_unparsed_content( diff --git a/tolino_notes/tolino_note.py b/tolino_notes/tolino_note.py index ad48dd8..598c561 100644 --- a/tolino_notes/tolino_note.py +++ b/tolino_notes/tolino_note.py @@ -69,11 +69,15 @@ def __get_language(hint: str) -> Optional[Tuple[dict, str]]: return None @staticmethod - def __clean_string(string: str) -> str: + def __clean_string(string: str, strip_trail_lead_quotes: bool = True) -> str: string = string.strip() + if strip_trail_lead_quotes: + for patt_repl in [ + (r'"$', ''), # Trailing quotes + (r'^"', ''), # Leading quotes + ]: + string = re.sub(patt_repl[0], patt_repl[1], string) for patt_repl in [ - (r'\s*"\s*$', ''), # Trailing quotes - (r'^\s*"\s*', ''), # Leading quotes (r'[\u2018\u2019\u00b4`]', '\''), # Special ticks ’‘´` (r'[“”«»]+', '"'), # Unwanted quote types (r'\'{2}', '"'), # Double-quotes made of single-quotes '' @@ -81,7 +85,7 @@ def __clean_string(string: str) -> str: (r'…', '...'), # Special dashes ]: string = re.sub(patt_repl[0], patt_repl[1], string) - return string + return string.strip() @staticmethod def from_unparsed_content(unparsed_content: str) -> Optional['TolinoNote']: @@ -141,14 +145,13 @@ def from_unparsed_content(unparsed_content: str) -> Optional['TolinoNote']: ) elif re.match(lang_dict['highlight_prefix'] + r'.*', prefix): # For highlights the entire content is what the user highlighted - content = TolinoNote.__clean_string( - ' '.join( - [ - re.sub(r'\s', ' ', li.strip()).strip() - for li in full_text_split[1:] - ] - ) + content = ' '.join( + [ + re.sub(r'\s', ' ', li.strip()).strip() + for li in full_text_split[1:] + ] ) + content = TolinoNote.__clean_string(content) return TolinoNote( NoteType.HIGHLIGHT, lang_id[1], @@ -165,7 +168,9 @@ def from_unparsed_content(unparsed_content: str) -> Optional['TolinoNote']: # Best guess: Begin of the book highlight is the last quote # preceeded by a line break. ¯\_(ツ)_/¯ user_notes = r'\n"'.join(fts.split('\n"')[:-1]) - user_notes = TolinoNote.__clean_string(re.sub(r'\s', ' ', user_notes)) + user_notes = TolinoNote.__clean_string( + re.sub(r'\s', ' ', user_notes), False + ) # Before that is what the user wrote highlight = r'\n"'.join(fts.split('\n"')[-1:]) highlight = TolinoNote.__clean_string(re.sub(r'\s', ' ', highlight))