From 81f04da994926670af82513872029c84a048b282 Mon Sep 17 00:00:00 2001
From: Lucas Terriel
Date: Wed, 13 Nov 2024 16:56:29 +0100
Subject: [PATCH 1/3] add three new Armenian (HY) models

---
Reviewer notes (free-form area, not part of the commit message):
  * Adds three model packages (hy_hye, hy_hyw, hy_xcl). Each __init__.py
    declares MODEL_NAME, DESC, DOWNLOADS and the Models spec string; each
    imports.py exposes get_iterator_and_processor() and the three imports.py
    files are identical (same blob b2698b4).
  * The hy_hye author list is a single string here; PATCH 2/3 splits it.
  * Changed versus the submitted patch: every new __init__.py ends with a
    trailing newline, and `VERSION = "v1.0.0"` is spaced per PEP 8. The
    `index` blob ids still name the originally submitted blobs.
  * NOTE(review): Python indentation and blank context lines were re-derived
    from the hunk line counts -- confirm against the branch before applying.

 .gitignore                             |  1 +
 pie_extended/models/hy_hye/__init__.py | 69 +++++++++++++++++++++++
 pie_extended/models/hy_hye/imports.py  | 14 +++++
 pie_extended/models/hy_hyw/__init__.py | 78 ++++++++++++++++++++++++++
 pie_extended/models/hy_hyw/imports.py  | 14 +++++
 pie_extended/models/hy_xcl/__init__.py | 48 ++++++++++++++++
 pie_extended/models/hy_xcl/imports.py  | 14 +++++
 7 files changed, 238 insertions(+)
 create mode 100644 pie_extended/models/hy_hye/__init__.py
 create mode 100644 pie_extended/models/hy_hye/imports.py
 create mode 100644 pie_extended/models/hy_hyw/__init__.py
 create mode 100644 pie_extended/models/hy_hyw/imports.py
 create mode 100644 pie_extended/models/hy_xcl/__init__.py
 create mode 100644 pie_extended/models/hy_xcl/imports.py

diff --git a/.gitignore b/.gitignore
index 615bfb6..5290337 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 /*.tsv
 pie_extended/downloads/*
 tests/**/*.txt
+run.py
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/pie_extended/models/hy_hye/__init__.py b/pie_extended/models/hy_hye/__init__.py
new file mode 100644
index 0000000..248ff90
--- /dev/null
+++ b/pie_extended/models/hy_hye/__init__.py
@@ -0,0 +1,69 @@
+from ...utils import Metadata, File, get_path
+
+
+MODEL_NAME = "hy_hye"
+
+DESC = Metadata(
+    "Eastern Armenian",
+    MODEL_NAME,
+    ["Chahan Vidal-Gorène, Nadi Tomeh, Victoria Khurshudyan"],
+    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Eastern Armenian, trained using Universal Dependencies for Eastern Armenian.",
+    "https://aclanthology.org/2024.nlp4dh-1.42/"
+)
+VERSION = "v1.0.0"
+DOWNLOADS = [
+    File("https://zenodo.org/records/14059437/files/hye-ud-abbr-2024_11_04-21_35_36.tar?download=1", "abbr.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-adptype-2024_11_04-21_39_05.tar?download=1", "adptype.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-animacy-2024_11_04-21_44_47.tar?download=1", "animacy.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-aspect-2024_11_04-21_50_42.tar?download=1", "aspect.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-case-2024_11_04-21_56_25.tar?download=1", "case.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-definite-2024_11_04-22_02_34.tar?download=1", "definite.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-degree-2024_11_04-22_03_22.tar?download=1", "degree.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-lemma-2024_11_04-22_13_33.tar?download=1", "lemma.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-mood-2024_11_04-22_19_24.tar?download=1", "mood.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-nametype-2024_11_04-22_21_58.tar?download=1", "nametype.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-number-2024_11_04-22_27_32.tar?download=1", "number.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-number-psor-2024_11_04-22_28_20.tar?download=1", "number-psor.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-numform-2024_11_04-22_29_36.tar?download=1", "numform.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-numtype-2024_11_04-22_33_42.tar?download=1", "numtype.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-person-2024_11_04-22_39_19.tar?download=1", "person.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-person-psor-2024_11_04-22_40_06.tar?download=1", "person-psor.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-polarity-2024_11_04-22_43_46.tar?download=1", "polarity.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-polite-2024_11_04-22_44_32.tar?download=1", "polite.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-pos-2024_11_04-22_49_22.tar?download=1", "pos.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-poss-2024_11_04-22_51_44.tar?download=1", "poss.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-prontype-2024_11_04-22_57_28.tar?download=1", "prontype.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-subcat-2024_11_04-23_03_26.tar?download=1", "subcat.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-tense-2024_11_04-23_08_31.tar?download=1", "tense.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-verbform-2024_11_04-23_13_30.tar?download=1", "verbform.tar"),
+    File("https://zenodo.org/records/14059437/files/hye-ud-voice-2024_11_04-23_19_22.tar?download=1", "voice.tar")
+]
+
+
+Models = "<{},lemma><{},pos><{},abbr><{},adptype><{},animacy><{},aspect><{},case><{},definite><{},degree><{},mood><{},nametype><{},number><{},number[psor]><{},numform><{},numtype><{},person><{},person[psor]><{},polarity><{},polite><{},poss><{},prontype><{},subcat><{},tense><{},verbform><{},voice>".format(
+    get_path(MODEL_NAME, "lemma.tar"),
+    get_path(MODEL_NAME, "pos.tar"),
+    get_path(MODEL_NAME, "abbr.tar"),
+    get_path(MODEL_NAME, "adptype.tar"),
+    get_path(MODEL_NAME, "animacy.tar"),
+    get_path(MODEL_NAME, "aspect.tar"),
+    get_path(MODEL_NAME, "case.tar"),
+    get_path(MODEL_NAME, "definite.tar"),
+    get_path(MODEL_NAME, "degree.tar"),
+    get_path(MODEL_NAME, "mood.tar"),
+    get_path(MODEL_NAME, "nametype.tar"),
+    get_path(MODEL_NAME, "number.tar"),
+    get_path(MODEL_NAME, "number-psor.tar"),
+    get_path(MODEL_NAME, "numform.tar"),
+    get_path(MODEL_NAME, "numtype.tar"),
+    get_path(MODEL_NAME, "person.tar"),
+    get_path(MODEL_NAME, "person-psor.tar"),
+    get_path(MODEL_NAME, "polarity.tar"),
+    get_path(MODEL_NAME, "polite.tar"),
+    get_path(MODEL_NAME, "poss.tar"),
+    get_path(MODEL_NAME, "prontype.tar"),
+    get_path(MODEL_NAME, "subcat.tar"),
+    get_path(MODEL_NAME, "tense.tar"),
+    get_path(MODEL_NAME, "verbform.tar"),
+    get_path(MODEL_NAME, "voice.tar")
+)
diff --git a/pie_extended/models/hy_hye/imports.py b/pie_extended/models/hy_hye/imports.py
new file mode 100644
index 0000000..b2698b4
--- /dev/null
+++ b/pie_extended/models/hy_hye/imports.py
@@ -0,0 +1,14 @@
+from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
+from pie_extended.pipeline.iterators.proto import DataIterator
+from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer
+
+
+def get_iterator_and_processor(max_tokens=256):
+    tokenizer = LengthTokenizer(35)
+    processor = ProcessorPrototype()
+    iterator = DataIterator(
+        tokenizer=tokenizer,
+        max_tokens=max_tokens
+    )
+    return iterator, processor
+
diff --git a/pie_extended/models/hy_hyw/__init__.py b/pie_extended/models/hy_hyw/__init__.py
new file mode 100644
index 0000000..8ae97ef
--- /dev/null
+++ b/pie_extended/models/hy_hyw/__init__.py
@@ -0,0 +1,78 @@
+from ...utils import Metadata, File, get_path
+
+MODEL_NAME = "hy_hyw"
+
+DESC = Metadata(
+    "Western Armenian",
+    MODEL_NAME,
+    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
+    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Western Armenian, trained using Universal Dependencies for Western Armenian.",
+    "https://aclanthology.org/2024.nlp4dh-1.42/"
+)
+
+DOWNLOADS = [
+    File("https://zenodo.org/records/14060082/files/hyw-ud-abbr-2024_11_04-23_21_46.tar?download=1", "abbr.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-adptype-2024_11_04-23_32_38.tar?download=1", "adptype.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-animacy-2024_11_04-23_46_09.tar?download=1", "animacy.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-aspect-2024_11_05-00_00_02.tar?download=1", "aspect.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-case-2024_11_05-00_13_40.tar?download=1", "case.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-connegative-2024_11_05-00_15_42.tar?download=1", "connegative.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-definite-2024_11_05-00_26_35.tar?download=1", "definite.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-degree-2024_11_05-00_28_25.tar?download=1", "degree.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-deixis-2024_11_05-00_35_20.tar?download=1", "deixis.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-echo-2024_11_05-00_37_23.tar?download=1", "echo.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-hyph-2024_11_05-00_41_02.tar?download=1", "hyph.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-lemma-2024_11_05-00_54_35.tar?download=1", "lemma.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-mood-2024_11_05-01_03_36.tar?download=1", "mood.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-number-2024_11_05-01_17_57.tar?download=1", "number.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-numform-2024_11_05-01_23_02.tar?download=1", "numform.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-numtype-2024_11_05-01_27_14.tar?download=1", "numtype.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-person-2024_11_05-01_39_52.tar?download=1", "person.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-person-psor-2024_11_05-01_41_42.tar?download=1", "person-psor.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-polarity-2024_11_05-01_54_03.tar?download=1", "polarity.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-polite-2024_11_05-01_55_54.tar?download=1", "polite.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-pos-2024_11_05-02_07_23.tar?download=1", "pos.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-poss-2024_11_05-02_12_00.tar?download=1", "poss.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-prontype-2024_11_05-02_21_41.tar?download=1", "prontype.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-reflex-2024_11_05-02_25_19.tar?download=1", "reflex.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-style-2024_11_05-02_27_10.tar?download=1", "style.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-subcat-2024_11_05-02_41_14.tar?download=1", "subcat.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-tense-2024_11_05-02_51_17.tar?download=1", "tense.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-typo-2024_11_05-02_53_08.tar?download=1", "typo.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-verbform-2024_11_05-03_02_55.tar?download=1", "verbform.tar"),
+    File("https://zenodo.org/records/14060082/files/hyw-ud-voice-2024_11_05-03_15_54.tar?download=1", "voice.tar")
+]
+
+
+Models = "<{},lemma><{},pos><{},abbr><{},adptype><{},animacy><{},aspect><{},case><{},connegative><{},definite><{},degree><{},deixis><{},echo><{},hyph><{},mood><{},number><{},numform><{},numtype><{},person><{},person[psor]><{},polarity><{},polite><{},poss><{},prontype><{},reflex><{},style><{},subcat><{},tense><{},typo><{},verbform><{},voice>".format(
+    get_path(MODEL_NAME, "lemma.tar"),
+    get_path(MODEL_NAME, "pos.tar"),
+    get_path(MODEL_NAME, "abbr.tar"),
+    get_path(MODEL_NAME, "adptype.tar"),
+    get_path(MODEL_NAME, "animacy.tar"),
+    get_path(MODEL_NAME, "aspect.tar"),
+    get_path(MODEL_NAME, "case.tar"),
+    get_path(MODEL_NAME, "connegative.tar"),
+    get_path(MODEL_NAME, "definite.tar"),
+    get_path(MODEL_NAME, "degree.tar"),
+    get_path(MODEL_NAME, "deixis.tar"),
+    get_path(MODEL_NAME, "echo.tar"),
+    get_path(MODEL_NAME, "hyph.tar"),
+    get_path(MODEL_NAME, "mood.tar"),
+    get_path(MODEL_NAME, "number.tar"),
+    get_path(MODEL_NAME, "numform.tar"),
+    get_path(MODEL_NAME, "numtype.tar"),
+    get_path(MODEL_NAME, "person.tar"),
+    get_path(MODEL_NAME, "person-psor.tar"),
+    get_path(MODEL_NAME, "polarity.tar"),
+    get_path(MODEL_NAME, "polite.tar"),
+    get_path(MODEL_NAME, "poss.tar"),
+    get_path(MODEL_NAME, "prontype.tar"),
+    get_path(MODEL_NAME, "reflex.tar"),
+    get_path(MODEL_NAME, "style.tar"),
+    get_path(MODEL_NAME, "subcat.tar"),
+    get_path(MODEL_NAME, "tense.tar"),
+    get_path(MODEL_NAME, "typo.tar"),
+    get_path(MODEL_NAME, "verbform.tar"),
+    get_path(MODEL_NAME, "voice.tar")
+)
diff --git a/pie_extended/models/hy_hyw/imports.py b/pie_extended/models/hy_hyw/imports.py
new file mode 100644
index 0000000..b2698b4
--- /dev/null
+++ b/pie_extended/models/hy_hyw/imports.py
@@ -0,0 +1,14 @@
+from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
+from pie_extended.pipeline.iterators.proto import DataIterator
+from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer
+
+
+def get_iterator_and_processor(max_tokens=256):
+    tokenizer = LengthTokenizer(35)
+    processor = ProcessorPrototype()
+    iterator = DataIterator(
+        tokenizer=tokenizer,
+        max_tokens=max_tokens
+    )
+    return iterator, processor
+
diff --git a/pie_extended/models/hy_xcl/__init__.py b/pie_extended/models/hy_xcl/__init__.py
new file mode 100644
index 0000000..0266b0f
--- /dev/null
+++ b/pie_extended/models/hy_xcl/__init__.py
@@ -0,0 +1,48 @@
+from ...utils import Metadata, File, get_path
+
+MODEL_NAME = "hy_xcl"
+
+DESC = Metadata(
+    "Classical Armenian",
+    MODEL_NAME,
+    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
+    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Classical Armenian, trained using Universal Dependencies for Classical Armenian.",
+    "https://aclanthology.org/2024.nlp4dh-1.42/"
+)
+
+DOWNLOADS = [
+    File("https://zenodo.org/records/14056139/files/xcl-ud-aspect-2024_11_05-03_24_48.tar?download=1", "aspect.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-case-2024_11_05-03_33_09.tar?download=1", "case.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-deixis-2024_11_05-03_38_12.tar?download=1", "deixis.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-lemma-2024_11_05-03_49_41.tar?download=1", "lemma.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-mood-2024_11_05-03_58_02.tar?download=1", "mood.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-number-2024_11_05-04_06_32.tar?download=1", "number.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-numtype-2024_11_05-04_08_02.tar?download=1", "numtype.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-person-2024_11_05-04_15_37.tar?download=1", "person.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-pos-2024_11_05-04_24_38.tar?download=1", "pos.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-poss-2024_11_05-04_25_45.tar?download=1", "poss.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-prontype-2024_11_05-04_31_17.tar?download=1", "prontype.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-reflex-2024_11_05-04_32_26.tar?download=1", "reflex.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-tense-2024_11_05-04_41_12.tar?download=1", "tense.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-verbform-2024_11_05-04_49_41.tar?download=1", "verbform.tar"),
+    File("https://zenodo.org/records/14056139/files/xcl-ud-voice-2024_11_05-04_56_34.tar?download=1", "voice.tar")
+]
+
+
+Models = "<{},lemma><{},pos><{},aspect><{},case><{},deixis><{},mood><{},number><{},numtype><{},person><{},poss><{},prontype><{},reflex><{},tense><{},verbform><{},voice>".format(
+    get_path(MODEL_NAME, "lemma.tar"),
+    get_path(MODEL_NAME, "pos.tar"),
+    get_path(MODEL_NAME, "aspect.tar"),
+    get_path(MODEL_NAME, "case.tar"),
+    get_path(MODEL_NAME, "deixis.tar"),
+    get_path(MODEL_NAME, "mood.tar"),
+    get_path(MODEL_NAME, "number.tar"),
+    get_path(MODEL_NAME, "numtype.tar"),
+    get_path(MODEL_NAME, "person.tar"),
+    get_path(MODEL_NAME, "poss.tar"),
+    get_path(MODEL_NAME, "prontype.tar"),
+    get_path(MODEL_NAME, "reflex.tar"),
+    get_path(MODEL_NAME, "tense.tar"),
+    get_path(MODEL_NAME, "verbform.tar"),
+    get_path(MODEL_NAME, "voice.tar")
+)
diff --git a/pie_extended/models/hy_xcl/imports.py b/pie_extended/models/hy_xcl/imports.py
new file mode 100644
index 0000000..b2698b4
--- /dev/null
+++ b/pie_extended/models/hy_xcl/imports.py
@@ -0,0 +1,14 @@
+from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
+from pie_extended.pipeline.iterators.proto import DataIterator
+from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer
+
+
+def get_iterator_and_processor(max_tokens=256):
+    tokenizer = LengthTokenizer(35)
+    processor = ProcessorPrototype()
+    iterator = DataIterator(
+        tokenizer=tokenizer,
+        max_tokens=max_tokens
+    )
+    return iterator, processor
+

From 9fdb3b0dbc8c3631774019168e6ad851ade6aee3 Mon Sep 17 00:00:00 2001
From: Lucas Terriel
Date: Wed, 13 Nov 2024 16:58:17 +0100
Subject: [PATCH 2/3] add three new Armenian (HY) models - fix typo for hy_hye

---
Reviewer notes (free-form area, not part of the commit message):
  * Splits the hy_hye author entry into one string per author, matching the
    author lists of hy_hyw and hy_xcl added in PATCH 1/3.

 pie_extended/models/hy_hye/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pie_extended/models/hy_hye/__init__.py b/pie_extended/models/hy_hye/__init__.py
index 248ff90..fcea116 100644
--- a/pie_extended/models/hy_hye/__init__.py
+++ b/pie_extended/models/hy_hye/__init__.py
@@ -6,7 +6,7 @@
 DESC = Metadata(
     "Eastern Armenian",
     MODEL_NAME,
-    ["Chahan Vidal-Gorène, Nadi Tomeh, Victoria Khurshudyan"],
+    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
     "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Eastern Armenian, trained using Universal Dependencies for Eastern Armenian.",
     "https://aclanthology.org/2024.nlp4dh-1.42/"
 )

From 3864845cd1a848c8a539e9deda65790d7843836b Mon Sep 17 00:00:00 2001
From: Lucas Terriel
Date: Wed, 13 Nov 2024 17:02:06 +0100
Subject: [PATCH 3/3] add three new Armenian (HY) models - update Readme

---
Reviewer notes (free-form area, not part of the commit message):
  * Lists the three new model identifiers in the README model list.
  * NOTE(review): the two blank context lines around the "If you trained
    models" paragraph are assumed from the `-40,6` count -- confirm.

 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 9dabd82..20700e6 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,9 @@ The current system provide an easier access to adding **customized**:
 - Early Modern French (Model: `freem`)
 - Classical French (Model: `fr`)
 - Old Dutch (Model: `dum`)
+- Eastern Armenian (Model: `hy_hye`)
+- Western Armenian (Model: `hy_hyw`)
+- Classical Armenian (Model: `hy_xcl`)
 
 If you trained models and want some help sharing them with Pie Extended, open an issue :)
 