Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[new models] add three new Armenian models #45

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
/*.tsv
pie_extended/downloads/*
tests/**/*.txt
run.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ The current system provide an easier access to adding **customized**:
- Early Modern French (Model: `freem`)
- Classical French (Model: `fr`)
- Old Dutch (Model: `dum`)
- Eastern Armenian (Model: `hy_hye`)
- Western Armenian (Model: `hy_hyw`)
- Classical Armenian (Model: `hy_xcl`)

If you trained models and want some help sharing them with Pie Extended, open an issue :)

Expand Down
69 changes: 69 additions & 0 deletions pie_extended/models/hy_hye/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from ...utils import Metadata, File, get_path


MODEL_NAME = "hy_hye"

# Descriptive metadata registered for this model.
DESC = Metadata(
    "Eastern Armenian",
    MODEL_NAME,
    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Eastern Armenian, trained using Universal Dependencies for Eastern Armenian.",
    "https://aclanthology.org/2024.nlp4dh-1.42/",
)
VERSION = "v1.0.0"

# Remote archives paired with the local file name each one is stored under.
DOWNLOADS = [
    File("https://zenodo.org/records/14059437/files/hye-ud-abbr-2024_11_04-21_35_36.tar?download=1", "abbr.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-adptype-2024_11_04-21_39_05.tar?download=1", "adptype.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-animacy-2024_11_04-21_44_47.tar?download=1", "animacy.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-aspect-2024_11_04-21_50_42.tar?download=1", "aspect.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-case-2024_11_04-21_56_25.tar?download=1", "case.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-definite-2024_11_04-22_02_34.tar?download=1", "definite.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-degree-2024_11_04-22_03_22.tar?download=1", "degree.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-lemma-2024_11_04-22_13_33.tar?download=1", "lemma.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-mood-2024_11_04-22_19_24.tar?download=1", "mood.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-nametype-2024_11_04-22_21_58.tar?download=1", "nametype.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-number-2024_11_04-22_27_32.tar?download=1", "number.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-number-psor-2024_11_04-22_28_20.tar?download=1", "number-psor.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-numform-2024_11_04-22_29_36.tar?download=1", "numform.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-numtype-2024_11_04-22_33_42.tar?download=1", "numtype.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-person-2024_11_04-22_39_19.tar?download=1", "person.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-person-psor-2024_11_04-22_40_06.tar?download=1", "person-psor.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-polarity-2024_11_04-22_43_46.tar?download=1", "polarity.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-polite-2024_11_04-22_44_32.tar?download=1", "polite.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-pos-2024_11_04-22_49_22.tar?download=1", "pos.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-poss-2024_11_04-22_51_44.tar?download=1", "poss.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-prontype-2024_11_04-22_57_28.tar?download=1", "prontype.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-subcat-2024_11_04-23_03_26.tar?download=1", "subcat.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-tense-2024_11_04-23_08_31.tar?download=1", "tense.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-verbform-2024_11_04-23_13_30.tar?download=1", "verbform.tar"),
    File("https://zenodo.org/records/14059437/files/hye-ud-voice-2024_11_04-23_19_22.tar?download=1", "voice.tar"),
]

# (local archive name, task label) pairs, listed in the exact order in which
# they must appear in the model specification string.
_TASK_ARCHIVES = (
    ("lemma.tar", "lemma"),
    ("pos.tar", "pos"),
    ("abbr.tar", "abbr"),
    ("adptype.tar", "adptype"),
    ("animacy.tar", "animacy"),
    ("aspect.tar", "aspect"),
    ("case.tar", "case"),
    ("definite.tar", "definite"),
    ("degree.tar", "degree"),
    ("mood.tar", "mood"),
    ("nametype.tar", "nametype"),
    ("number.tar", "number"),
    ("number-psor.tar", "number[psor]"),
    ("numform.tar", "numform"),
    ("numtype.tar", "numtype"),
    ("person.tar", "person"),
    ("person-psor.tar", "person[psor]"),
    ("polarity.tar", "polarity"),
    ("polite.tar", "polite"),
    ("poss.tar", "poss"),
    ("prontype.tar", "prontype"),
    ("subcat.tar", "subcat"),
    ("tense.tar", "tense"),
    ("verbform.tar", "verbform"),
    ("voice.tar", "voice"),
)

# Model specification: one "<path,task>" segment per archive, concatenated
# without any separator.
Models = "".join(
    "<{},{}>".format(get_path(MODEL_NAME, archive), task)
    for archive, task in _TASK_ARCHIVES
)
14 changes: 14 additions & 0 deletions pie_extended/models/hy_hye/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer


def get_iterator_and_processor(max_tokens=256):
    """Create the data iterator and post-processor for the hy_hye model.

    :param max_tokens: Forwarded unchanged to ``DataIterator`` as ``max_tokens``.
    :return: Tuple ``(iterator, processor)``.
    """
    # NOTE(review): 35 is presumably the length at which LengthTokenizer
    # splits its input -- confirm against the tokenizer implementation.
    length_tokenizer = LengthTokenizer(35)
    post_processor = ProcessorPrototype()
    return (
        DataIterator(tokenizer=length_tokenizer, max_tokens=max_tokens),
        post_processor,
    )

78 changes: 78 additions & 0 deletions pie_extended/models/hy_hyw/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from ...utils import Metadata, File, get_path

MODEL_NAME = "hy_hyw"

# Descriptive metadata registered for this model.
DESC = Metadata(
    "Western Armenian",
    MODEL_NAME,
    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Western Armenian, trained using Universal Dependencies for Western Armenian.",
    "https://aclanthology.org/2024.nlp4dh-1.42/",
)

# Remote archives paired with the local file name each one is stored under.
DOWNLOADS = [
    File("https://zenodo.org/records/14060082/files/hyw-ud-abbr-2024_11_04-23_21_46.tar?download=1", "abbr.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-adptype-2024_11_04-23_32_38.tar?download=1", "adptype.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-animacy-2024_11_04-23_46_09.tar?download=1", "animacy.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-aspect-2024_11_05-00_00_02.tar?download=1", "aspect.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-case-2024_11_05-00_13_40.tar?download=1", "case.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-connegative-2024_11_05-00_15_42.tar?download=1", "connegative.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-definite-2024_11_05-00_26_35.tar?download=1", "definite.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-degree-2024_11_05-00_28_25.tar?download=1", "degree.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-deixis-2024_11_05-00_35_20.tar?download=1", "deixis.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-echo-2024_11_05-00_37_23.tar?download=1", "echo.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-hyph-2024_11_05-00_41_02.tar?download=1", "hyph.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-lemma-2024_11_05-00_54_35.tar?download=1", "lemma.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-mood-2024_11_05-01_03_36.tar?download=1", "mood.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-number-2024_11_05-01_17_57.tar?download=1", "number.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-numform-2024_11_05-01_23_02.tar?download=1", "numform.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-numtype-2024_11_05-01_27_14.tar?download=1", "numtype.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-person-2024_11_05-01_39_52.tar?download=1", "person.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-person-psor-2024_11_05-01_41_42.tar?download=1", "person-psor.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-polarity-2024_11_05-01_54_03.tar?download=1", "polarity.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-polite-2024_11_05-01_55_54.tar?download=1", "polite.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-pos-2024_11_05-02_07_23.tar?download=1", "pos.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-poss-2024_11_05-02_12_00.tar?download=1", "poss.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-prontype-2024_11_05-02_21_41.tar?download=1", "prontype.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-reflex-2024_11_05-02_25_19.tar?download=1", "reflex.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-style-2024_11_05-02_27_10.tar?download=1", "style.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-subcat-2024_11_05-02_41_14.tar?download=1", "subcat.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-tense-2024_11_05-02_51_17.tar?download=1", "tense.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-typo-2024_11_05-02_53_08.tar?download=1", "typo.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-verbform-2024_11_05-03_02_55.tar?download=1", "verbform.tar"),
    File("https://zenodo.org/records/14060082/files/hyw-ud-voice-2024_11_05-03_15_54.tar?download=1", "voice.tar"),
]

# (local archive name, task label) pairs, listed in the exact order in which
# they must appear in the model specification string.
_TASK_ARCHIVES = (
    ("lemma.tar", "lemma"),
    ("pos.tar", "pos"),
    ("abbr.tar", "abbr"),
    ("adptype.tar", "adptype"),
    ("animacy.tar", "animacy"),
    ("aspect.tar", "aspect"),
    ("case.tar", "case"),
    ("connegative.tar", "connegative"),
    ("definite.tar", "definite"),
    ("degree.tar", "degree"),
    ("deixis.tar", "deixis"),
    ("echo.tar", "echo"),
    ("hyph.tar", "hyph"),
    ("mood.tar", "mood"),
    ("number.tar", "number"),
    ("numform.tar", "numform"),
    ("numtype.tar", "numtype"),
    ("person.tar", "person"),
    ("person-psor.tar", "person[psor]"),
    ("polarity.tar", "polarity"),
    ("polite.tar", "polite"),
    ("poss.tar", "poss"),
    ("prontype.tar", "prontype"),
    ("reflex.tar", "reflex"),
    ("style.tar", "style"),
    ("subcat.tar", "subcat"),
    ("tense.tar", "tense"),
    ("typo.tar", "typo"),
    ("verbform.tar", "verbform"),
    ("voice.tar", "voice"),
)

# Model specification: one "<path,task>" segment per archive, concatenated
# without any separator.
Models = "".join(
    "<{},{}>".format(get_path(MODEL_NAME, archive), task)
    for archive, task in _TASK_ARCHIVES
)
14 changes: 14 additions & 0 deletions pie_extended/models/hy_hyw/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer


def get_iterator_and_processor(max_tokens=256):
    """Create the data iterator and post-processor for the hy_hyw model.

    :param max_tokens: Forwarded unchanged to ``DataIterator`` as ``max_tokens``.
    :return: Tuple ``(iterator, processor)``.
    """
    # NOTE(review): 35 is presumably the length at which LengthTokenizer
    # splits its input -- confirm against the tokenizer implementation.
    length_tokenizer = LengthTokenizer(35)
    post_processor = ProcessorPrototype()
    return (
        DataIterator(tokenizer=length_tokenizer, max_tokens=max_tokens),
        post_processor,
    )

48 changes: 48 additions & 0 deletions pie_extended/models/hy_xcl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from ...utils import Metadata, File, get_path

MODEL_NAME = "hy_xcl"

# Descriptive metadata registered for this model.
DESC = Metadata(
    "Classical Armenian",
    MODEL_NAME,
    ["Chahan Vidal-Gorène", "Nadi Tomeh", "Victoria Khurshudyan"],
    "Pie Model for Lemmatization, POS Tagging, and Morphological Analysis of Classical Armenian, trained using Universal Dependencies for Classical Armenian.",
    "https://aclanthology.org/2024.nlp4dh-1.42/",
)

# Remote archives paired with the local file name each one is stored under.
DOWNLOADS = [
    File("https://zenodo.org/records/14056139/files/xcl-ud-aspect-2024_11_05-03_24_48.tar?download=1", "aspect.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-case-2024_11_05-03_33_09.tar?download=1", "case.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-deixis-2024_11_05-03_38_12.tar?download=1", "deixis.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-lemma-2024_11_05-03_49_41.tar?download=1", "lemma.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-mood-2024_11_05-03_58_02.tar?download=1", "mood.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-number-2024_11_05-04_06_32.tar?download=1", "number.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-numtype-2024_11_05-04_08_02.tar?download=1", "numtype.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-person-2024_11_05-04_15_37.tar?download=1", "person.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-pos-2024_11_05-04_24_38.tar?download=1", "pos.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-poss-2024_11_05-04_25_45.tar?download=1", "poss.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-prontype-2024_11_05-04_31_17.tar?download=1", "prontype.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-reflex-2024_11_05-04_32_26.tar?download=1", "reflex.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-tense-2024_11_05-04_41_12.tar?download=1", "tense.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-verbform-2024_11_05-04_49_41.tar?download=1", "verbform.tar"),
    File("https://zenodo.org/records/14056139/files/xcl-ud-voice-2024_11_05-04_56_34.tar?download=1", "voice.tar"),
]

# (local archive name, task label) pairs, listed in the exact order in which
# they must appear in the model specification string.
_TASK_ARCHIVES = (
    ("lemma.tar", "lemma"),
    ("pos.tar", "pos"),
    ("aspect.tar", "aspect"),
    ("case.tar", "case"),
    ("deixis.tar", "deixis"),
    ("mood.tar", "mood"),
    ("number.tar", "number"),
    ("numtype.tar", "numtype"),
    ("person.tar", "person"),
    ("poss.tar", "poss"),
    ("prontype.tar", "prontype"),
    ("reflex.tar", "reflex"),
    ("tense.tar", "tense"),
    ("verbform.tar", "verbform"),
    ("voice.tar", "voice"),
)

# Model specification: one "<path,task>" segment per archive, concatenated
# without any separator.
Models = "".join(
    "<{},{}>".format(get_path(MODEL_NAME, archive), task)
    for archive, task in _TASK_ARCHIVES
)
14 changes: 14 additions & 0 deletions pie_extended/models/hy_xcl/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.tokenizers.simple_tokenizer import LengthTokenizer


def get_iterator_and_processor(max_tokens=256):
    """Create the data iterator and post-processor for the hy_xcl model.

    :param max_tokens: Forwarded unchanged to ``DataIterator`` as ``max_tokens``.
    :return: Tuple ``(iterator, processor)``.
    """
    # NOTE(review): 35 is presumably the length at which LengthTokenizer
    # splits its input -- confirm against the tokenizer implementation.
    length_tokenizer = LengthTokenizer(35)
    post_processor = ProcessorPrototype()
    return (
        DataIterator(tokenizer=length_tokenizer, max_tokens=max_tokens),
        post_processor,
    )