Skip to content

Commit

Permalink
more handparse rules
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Aug 3, 2024
1 parent 6ae51b4 commit 2035aeb
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 13 deletions.
62 changes: 62 additions & 0 deletions batchalign/pipelines/morphosyntax/ja/verbforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,68 @@
"""

def verbform(upos, target, text):
if "撮る" in text:
return "verb", "撮る"
if "貼る" in text:
return "verb", "貼る"
if "混ぜ" in text:
return "verb", "混ぜる"
if "釣る" in text:
return "verb", "釣る"
if "速い" in text and upos == "adj":
return "adj", "速い"
if "治ま" in text:
return "verb", "治まる"
if "刺す" in text:
return "verb", "刺す"
if "降り" in text:
return "verb", "降りる"
if "降" in text:
return "verb", "降る"
if "載せ" in text:
return "verb", "載せる"
if "帰" in text:
return "verb", "帰る"
if "はい" in text:
return "intj", "はい"
if "うん" in text:
return "intj", "うん"
if "おっ" in text:
return "intj", "おっ"
if "ほら" in text:
return "intj", "ほら"
if "ヤッホー" in text:
return "intj", "ヤッホー"
if "ただいま" in text:
return "intj", "ただいま"
if "あたし" in text:
return "pron", "あたし"
if "舐め" in text:
return "verb", "舐める"
if "バツ" in text:
return "noun", "バツ"
if "ブラシ" in text:
return "noun", "ブラシ"
if "引き出し" in text:
return "noun", "引き出し"
if "下さい" in text:
return "noun", "下さい"
if target in ["シャャミー", "物コャミ"]:
return "noun", "クシャミ"
if "マヨネーズ" in text:
return "noun", "マヨネーズ"
if "マヨ" in text:
return "noun", "マヨ"
if "チップス" in text:
return "noun", "チップス"
if "ゴロンっ" in text:
return "noun", "ゴロンっ"
if "モチーンっ" in text:
return "noun", "モチーンっ"
if "人っ" == text:
return "noun", "人"
if text == "掻く":
return "part", "かい"
if "遣" in text and upos == "noun":
return "verb", "遣る"
if "死" in text:
Expand Down
7 changes: 6 additions & 1 deletion batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,8 @@ def handler__VERB(word, lang=None):
res = handler(word, lang)
if "sconj" in res:
return res
elif "verb" not in res:
return res
else:
return res+flag+stringify_feats(aspect, mood,
tense, polarity, polite,
Expand Down Expand Up @@ -266,7 +268,10 @@ def handler__PUNCT(word, lang=None):
return "noun|da"
elif re.match(r"^['\w-]+$", word.text): # we match text here because .text is the ultimate content
# instead of the lemma, which may be entirely weird
return f"x|{word.text}"
if word.text == "もん":
return f"part|{word.text}"
else:
return f"x|{word.text}"

# Register handlers
HANDLERS = {
Expand Down
6 changes: 3 additions & 3 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.7.3-beta.15
July 29th, 2024
Correct Unicode Escapes?
0.7.3-beta.16
August 3rd, 2024
more Japanese hand-parse rules
21 changes: 12 additions & 9 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,20 +100,23 @@

# text = "ice ice cream ice cream"

# function = "morphosyntax"
# lang = "eng"
# num_speakers = 1
function = "morphosyntax"
lang = "jpn"
num_speakers = 1

# forms, delim = chat_parse_utterance("why do I care ?", None, None, None, None)
# utterance = Utterance(content=forms, delim=delim, text="why do I care ?")

ut = "色が変わる飴舐めてる ."

# ut = Document(content=[utterance], langs=["eng"])
forms, delim = chat_parse_utterance(ut, None, None, None, None)
utterance = Utterance(content=forms, delim=delim, text=ut)

# pipeline = BatchalignPipeline.new("morphosyntax", lang="eng")
# res = pipeline(ut, retokenize=True)

# print(str(CHATFile(doc=res)))
ut = Document(content=[utterance], langs=[lang])

pipeline = BatchalignPipeline.new("morphosyntax", lang=lang)
res = pipeline(ut, retokenize=True)

print(str(CHATFile(doc=res)))

# print(u"\u202bwhat up with that?")
# print("אויתאויונסתהאויסו".encode().decode("").encode().decode("utf-8"))
Expand Down

0 comments on commit 2035aeb

Please sign in to comment.