diff --git a/pie_extended/models/__init__.py b/pie_extended/models/__init__.py
index 609edd5..6937cb9 100644
--- a/pie_extended/models/__init__.py
+++ b/pie_extended/models/__init__.py
@@ -4,5 +4,6 @@
     "fr",
     "freem",
     "grc",
-    "dum"
+    "dum",
+    "occ_cont"
 ]
diff --git a/pie_extended/models/occ_cont/__init__.py b/pie_extended/models/occ_cont/__init__.py
new file mode 100644
index 0000000..ebcc8e8
--- /dev/null
+++ b/pie_extended/models/occ_cont/__init__.py
@@ -0,0 +1,25 @@
+from ...utils import Metadata, File, get_path
+
+
+DESC = Metadata(
+    "OccitanContemporain",
+    "occ_cont",
+    ["Oriane Nédey", "Juliette Janès"],
+    "Model trained on ...",
+    "https://github.com/DEFI-COLaF/modeles-papie"
+)
+
+VERSION = "v0.0.1"
+DOWNLOADS = [
+    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
+         "/PaPie_Lemma_finetune-WIKI2TTB-v0.0.1.tar",
+         "lemma.tar"),
+    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
+         "/PaPie_POS_WIKITTB-v0.0.1.tar",
+         "pos.tar"),
+]
+
+Models = "".join([
+    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
+    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
+])
diff --git a/pie_extended/models/occ_cont/imports.py b/pie_extended/models/occ_cont/imports.py
new file mode 100644
index 0000000..7b45045
--- /dev/null
+++ b/pie_extended/models/occ_cont/imports.py
@@ -0,0 +1,23 @@
+from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
+from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
+from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer
+from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
+
+
+def get_iterator_and_processor(max_tokens=256):
+    tokenizer = OccMemorizingTokenizer()
+    processor = MemoryzingProcessor(
+        tokenizer_memory=tokenizer,
+        head_processor=ProcessorPrototype()
+    )
+    iterator = DataIterator(
+        tokenizer=tokenizer,
+        max_tokens=max_tokens,
+        exclude_patterns=[
+            excl.exclude_regexp
+            for excl in tokenizer.normalizers
+            if excl.exclude_regexp
+        ]
+    )
+    return iterator, processor
+
diff --git a/pie_extended/models/occ_cont/processor.py b/pie_extended/models/occ_cont/processor.py
new file mode 100644
index 0000000..e69de29
diff --git a/pie_extended/models/occ_cont/tokenizer.py b/pie_extended/models/occ_cont/tokenizer.py
new file mode 100644
index 0000000..2e80caf
--- /dev/null
+++ b/pie_extended/models/occ_cont/tokenizer.py
@@ -0,0 +1,127 @@
+import regex as re
+from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
+from typing import List, Generator, Tuple
+import unicodedata
+from pie_extended.pipeline.tokenizers.utils.excluder import (
+    ReferenceExcluder,
+    DEFAULT_CHAR_REGISTRY,
+    ApostropheExcluder,
+    chars
+)
+
+_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
+_SpaceNormalizer = re.compile(r"(\s+)")
+_APO = chars.APOSTROPHE
+
+
+class OccMemorizingTokenizer(MemorizingTokenizer):
+    """ Occitan Tokenizer with memorizing capacities (for normalization steps)
+
+    This tokenizer is based on a Perl script published by Marianne Vergez-Couret in 2019
+    (https://zenodo.org/records/2533873), as well as the description of the Python tokenizer in
+    (Miletić, 2023), which was also derived from the work of the RESTAURE project.
+    It was adapted to Python by Oriane Nédey and then integrated into Pie-Extended.
+    """
+    _sentence_boundaries = re.compile(
+        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
+    )
+    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")
+
+    # Define a pattern that matches any punctuation or symbol, with exceptions
+    re_in_non_amb = re.compile(rf"(?![{_APO}\-,.<>])" + r"[\p{P}\p{S}]")
+
+    # Define a pattern that matches (XML/HTML...) tags  # ToDo: check that this change is ok
+    re_tags = re.compile(r'(<\\?[^\d\s].*>)')
+
+    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")
+
+    def __init__(self):
+        super(OccMemorizingTokenizer, self).__init__()
+        self.tokens = []
+        self.char_registry = DEFAULT_CHAR_REGISTRY
+        self.normalizers: Tuple[ReferenceExcluder] = (
+            ReferenceExcluder(char_registry=self.char_registry),
+        )
+        self.re_split_step_one = re.compile(
+            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
+        )
+
+    @staticmethod
+    def _sentence_tokenizer_merge_matches(match):
+        """ Best way we found to deal with repeating groups"""
+        start, end = match.span()
+        return match.string[start:end] + "<SPLIT>"
+
+    def _real_sentence_tokenizer(self, string: str) -> List[str]:
+        string = _SpaceNormalizer.sub(" ", string.strip())
+        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)
+
+        for normalizer in self.normalizers:
+            string = normalizer.after_sentence_tokenizer(string)
+
+        return string.split("<SPLIT>")
+
+    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
+        """
+        Segments a string into a list of tokens by applying Occitan-specific regular expressions.
+
+        :param text: string, ideally one single segment.
+        :returns: list of segmented tokens
+        """
+        res = []
+        # Normalize the apostrophe of qu' d' l'
+        # ToDo: Unclear whether we should simply use the apostrophe excluder instead...
+        # ToDo: Unclear whether this is already taken care of by the regexps below
+        text = re.sub(rf"((?:qu)|[dl])[{_APO}]", r"\1' ", text)
+        text = re.sub(r'(\d)\s(\d)', r'\1\2', text)
+        for m in self.re_split_step_one.split(text):
+            if not m or not m.strip():
+                continue
+            elif self.normalizers[0].re.match(m):
+                res.append(m)
+            elif self.re_split_match.match(m):
+                res.append(m)
+            else:
+                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
+                m = re.sub(r"(\P{L}|^)" + rf"([dlmnst][{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]" + rf"[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"qu[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Duplicate of [qnv][us]' ?
+                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"ent[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"[çcbzu][{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Merge with [dlmnst] ?
+                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
+                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
+                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
+                m = re.sub(r",($|\D)", r"\t,\t\1", m)
+                m = re.sub(rf"-(vos|ne|[st][eu]?[{_APO}]?|l[aoi{_APO}]s?|me|d[{_APO}]|en|[nv]os|u)" + r"($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO Try to simplify ?
+                m = re.sub(rf"[{_APO}]" + r"([unv]\p{L}*)($|\P{L})", rf"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
+                m = re.sub(rf"[{_APO}]" + r"([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
+                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
+                m = re.sub(r"", ' ', m)
+                m = re.sub(r"([<>])", r"\t\1\t", m)
+                res.extend(m.split('\t'))
+
+        # Remove empty tokens
+        res = [item for item in res if item.strip()]
+        return res
+
+    def normalizer(self, data: str) -> str:
+        for excluder in self.normalizers:
+            data = excluder.before_sentence_tokenizer(data)
+        return data
+
+    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
+        sentences = list()
+        data = self.normalizer(text)
+        for sent in self._real_sentence_tokenizer(data):
+            sent = sent.strip()
+            if sent:
+                sentences.append(self.word_tokenizer(sent))
+        yield from sentences
+
+    def replacer(self, inp: str):
+        for excluder in self.normalizers:
+            if excluder.exclude_regexp.match(inp):
+                if excluder.can_be_replaced:
+                    return inp
+
+        return unicodedata.normalize("NFKC", inp)
diff --git a/pie_extended/pipeline/tokenizers/utils/excluder.py b/pie_extended/pipeline/tokenizers/utils/excluder.py
index f39c384..7967f1a 100644
--- a/pie_extended/pipeline/tokenizers/utils/excluder.py
+++ b/pie_extended/pipeline/tokenizers/utils/excluder.py
@@ -293,9 +293,10 @@ def __init__(self,
                  match_apostrophes=chars.APOSTROPHE,
                  char_registry: Optional[CharRegistry] = None,
                  add_space_after: bool = True,
-                 add_space_before: bool = False):
+                 add_space_before: bool = False,
+                 regex: Optional[str] = None):
         self.apostrophes = match_apostrophes
-        self.re: re.Regex = re.compile(r"(\w+)([" + self.apostrophes + r"])(\w+)")
+        self.re: re.Regex = re.compile(regex or (r"(\w+)([" + self.apostrophes + r"])(\w+)"))
         self.char_registry = char_registry or CharRegistry()
 
         # Space handling
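Not part of the patch: below is a minimal usage sketch of the pieces added above. It assumes that the pie-extended base machinery (notably MemorizingTokenizer.word_tokenizer and the excluder utilities) behaves as it does for the other bundled models, and that the occ_cont archives listed in DOWNLOADS have been fetched before any actual tagging; the Occitan sample sentence is illustrative only. Once the release files are published at the URLs above, the model should also be reachable through pie-extended's usual download and tag commands.

# Usage sketch (not part of the diff); all imported names come from the new files above.
from pie_extended.models.occ_cont.imports import get_iterator_and_processor
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer

# Tokenization alone does not require the downloaded model archives.
tokenizer = OccMemorizingTokenizer()
for sentence in tokenizer.sentence_tokenizer("Los dròlles d'aquí parlan l'occitan."):  # sample input
    print(sentence)  # each sentence is a list of tokens

# The iterator/processor pair is what pie-extended wires into the tagging pipeline.
iterator, processor = get_iterator_and_processor(max_tokens=256)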