Adding an alpha version of Occitan Contemporain by COLaF
PonteIneptique committed May 30, 2024
1 parent 7d37a36 commit f9ca058
Showing 6 changed files with 180 additions and 3 deletions.
3 changes: 2 additions & 1 deletion pie_extended/models/__init__.py
@@ -4,5 +4,6 @@
    "fr",
    "freem",
    "grc",
    "dum"
    "dum",
    "occ_cont"
]
25 changes: 25 additions & 0 deletions pie_extended/models/occ_cont/__init__.py
@@ -0,0 +1,25 @@
from ...utils import Metadata, File, get_path


DESC = Metadata(
    "OccitanContemporain",
    "occ_cont",
    ["Oriane Nédey", "Juliette Janès"],
    "Model trained on ...",
    "https://github.com/DEFI-COLaF/modeles-papie"
)

VERSION = "v0.0.1"
DOWNLOADS = [
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/PaPie_Lemma_finetune-WIKI2TTB-v0.0.1.tar",
         "lemma.tar"),
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/PaPie_POS_WIKITTB-v0.0.1.tar",
         "pos.tar"),
]

Models = "".join([
    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
])
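Following the pattern of the other model packages, DESC, DOWNLOADS and Models are the module-level constants pie-extended looks up when the model is requested: DESC describes it, DOWNLOADS lists the release archives to fetch, and Models tells the tagger which downloaded .tar to load for lemmatisation and POS tagging. A minimal inspection sketch, using only names defined in this file (the printed paths depend on the local installation):

    from pie_extended.models.occ_cont import DESC, DOWNLOADS, Models

    print(Models)
    # e.g. "<.../occ_cont/lemma.tar,lemma><.../occ_cont/pos.tar,pos>"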
23 changes: 23 additions & 0 deletions pie_extended/models/occ_cont/imports.py
@@ -0,0 +1,23 @@
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype


def get_iterator_and_processor(max_tokens=256):
    tokenizer = OccMemorizingTokenizer()
    processor = MemoryzingProcessor(
        tokenizer_memory=tokenizer,
        head_processor=ProcessorPrototype()
    )
    iterator = DataIterator(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        exclude_patterns=[
            excl.exclude_regexp
            for excl in tokenizer.normalizers
            if excl.exclude_regexp
        ]
    )
    return iterator, processor
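A minimal sketch of how this factory is meant to be consumed; only get_iterator_and_processor and the tokenizer come from this commit, and the comments describe the intended role of each returned object:

    from pie_extended.models.occ_cont.imports import get_iterator_and_processor

    iterator, processor = get_iterator_and_processor(max_tokens=256)
    # iterator: yields sentences to the tagger, excluding tokens that match the
    #           exclude_regexp of the tokenizer's normalizers (the ReferenceExcluder)
    # processor: a MemoryzingProcessor that restores the memorized original
    #            surface forms in the tagger's output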

Empty file.
127 changes: 127 additions & 0 deletions pie_extended/models/occ_cont/tokenizer.py
@@ -0,0 +1,127 @@
import regex as re
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
from typing import List, Generator, Tuple
import unicodedata
from pie_extended.pipeline.tokenizers.utils.excluder import (
    ReferenceExcluder,
    DEFAULT_CHAR_REGISTRY,
    ApostropheExcluder,
    chars
)

_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
_SpaceNormalizer = re.compile(r"(\s+)")
_APO = chars.APOSTROPHE


class OccMemorizingTokenizer(MemorizingTokenizer):
    """ Occitan tokenizer with memorizing capacities (for normalization steps).

    This tokenizer is based on a Perl script published by Marianne Verges-Couret in 2019
    (https://zenodo.org/records/2533873), as well as on the description of the Python tokenizer
    in (Miletić, 2023), which was also derived from the work done in the RESTAURE project.
    It was ported to Python by Oriane Nedey and then adapted to Pie-Extended.
    """
    _sentence_boundaries = re.compile(
        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
    )
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")

    # Define a pattern that matches any punctuation or symbol, with exceptions
    re_in_non_amb = re.compile(rf"(?![{_APO}\-,.<>])" + r"[\p{P}\p{S}]")

    # Define a pattern that matches (XML/HTML...) tags  # ToDo: check that this change is OK
    re_tags = re.compile(r'(<\\?[^\d\s].*>)')

    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")

    def __init__(self):
        super(OccMemorizingTokenizer, self).__init__()
        self.tokens = []
        self.char_registry = DEFAULT_CHAR_REGISTRY
        self.normalizers: Tuple[ReferenceExcluder] = (
            ReferenceExcluder(char_registry=self.char_registry),
        )
        self.re_split_step_one = re.compile(
            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
        )

    @staticmethod
    def _sentence_tokenizer_merge_matches(match):
        """ Best way we found to deal with repeating groups """
        start, end = match.span()
        return match.string[start:end] + "<SPLIT>"

    def _real_sentence_tokenizer(self, string: str) -> List[str]:
        string = _SpaceNormalizer.sub(" ", string.strip())
        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)

        for normalizer in self.normalizers:
            string = normalizer.after_sentence_tokenizer(string)

        return string.split("<SPLIT>")

    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
        """
        Segments a string into a list of tokens by applying Occitan-specific regular expressions.

        :param text: string, ideally one single segment.
        :returns: list of segmented tokens
        """
        res = []
        # Normalize the apostrophe of qu' d' l'
        # ToDo: Unclear whether we should not simply use the normalizer for apostrophes...
        # ToDo: Unclear whether this is already taken care of by the rest of the regexps
        text = re.sub(rf"((?:qu)|[dl])[{_APO}]", r"\1' ", text)
        text = re.sub(r'(\d)\s(\d)', r'\1<PPLesp>\2', text)
        for m in self.re_split_step_one.split(text):
            if not m or not m.strip():
                continue
            elif self.normalizers[0].re.match(m):
                res.append(m)
            elif self.re_split_match.match(m):
                res.append(m)
            else:
                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
                m = re.sub(r"(\P{L}|^)" + rf"([dlmnst][{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]" + rf"[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"qu[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Duplicate of [qnv][us]' ?
                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"ent[{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*" + rf"[çcbzu][{_APO}])", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Merge with [dlmnst] ?
                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
                m = re.sub(r",($|\D)", r"\t,\t\1", m)
                m = re.sub(rf"-(vos|ne|[st][eu]?[{_APO}]?|l[aoi{_APO}]s?|me|d[{_APO}]|en|[nv]os|u)" + r"($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO Try to simplify ?
                m = re.sub(rf"[{_APO}]" + r"([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
                m = re.sub(rf"[{_APO}]" + r"([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
                m = re.sub(r"<PPLesp>", ' ', m)
                m = re.sub(r"([<>])", r"\t\1\t", m)
                res.extend(m.split('\t'))

        # Remove empty tokens
        res = [item for item in res if item.strip()]
        return res

    def normalizer(self, data: str) -> str:
        for excluder in self.normalizers:
            data = excluder.before_sentence_tokenizer(data)
        return data

    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
        sentences = list()
        data = self.normalizer(text)
        for sent in self._real_sentence_tokenizer(data):
            sent = sent.strip()
            if sent:
                sentences.append(self.word_tokenizer(sent))
        yield from sentences

    def replacer(self, inp: str):
        for excluder in self.normalizers:
            if excluder.exclude_regexp.match(inp):
                if excluder.can_be_replaced:
                    return inp

        return unicodedata.normalize("NFKC", inp)
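To see what the rules above do in practice, the tokenizer can be exercised on its own; the sample sentence and the segmentation sketched in the comment are illustrative rather than a tested output:

    from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer

    tokenizer = OccMemorizingTokenizer()
    for tokens in tokenizer.sentence_tokenizer("Qu'es çò que l'òme a dit."):
        print(tokens)
    # expected shape: elisions split off as separate tokens, roughly
    # ["Qu'", "es", "çò", "que", "l'", "òme", "a", "dit", "."]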
5 changes: 3 additions & 2 deletions pie_extended/pipeline/tokenizers/utils/excluder.py
@@ -293,9 +293,10 @@ def __init__(self,
                 match_apostrophes=chars.APOSTROPHE,
                 char_registry: Optional[CharRegistry] = None,
                 add_space_after: bool = True,
                 add_space_before: bool = False):
                 add_space_before: bool = False,
                 regex: Optional[str] = None):
        self.apostrophes = match_apostrophes
        self.re: re.Regex = re.compile(r"(\w+)([" + self.apostrophes + r"])(\w+)")
        self.re: re.Regex = re.compile(regex or (r"(\w+)([" + self.apostrophes + r"])(\w+)"))
        self.char_registry = char_registry or CharRegistry()

        # Space handling