-
Notifications
You must be signed in to change notification settings - Fork 9
/
merge-names.rlx
160 lines (133 loc) · 5.77 KB
/
merge-names.rlx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- cg-pre-pipe: "apertium -d . nob-nno-lawmerge | cg-conv a" -*-
# Window (sentence) boundaries: a cohort matching the baseform "." or "?",
# or carrying the tag sent, ends the current window.
DELIMITERS = "." "?" sent ;
# TODO: "Lene" "Westgaard+Halle" – merge compound last-names
# Example input:
# "<Rolv>"
# "Rolv" np ant m Aa @subst→
# "<Ravlo>"
# "Ravlo" np cog Aa @app
# "<.>"
# "." sent clb aa
# "<Rolv>"
# "Rolv" np ant m Aa @subst→
# "<Molv>"
# "Molv" np ant m Aa @subst→
# "<Ravlo>"
# "Ravlo" np cog Aa @app
# "<.>"
# "." sent clb aa
#
# Expected output:
# "<Rolv Ravlo>"
# "Rolv Ravlo" np cog cgguess Aa @app
# "<.>"
# "." sent clb aa
# "<Rolv Molv Ravlo>"
# "Rolv Molv Ravlo" np cog cgguess Aa @app
# "<.>"
# "." sent clb aa
# Restricting to letters will exclude e.g. «Haddy N’jie», but at least then bidix will let it through
# We allow a single space or dash, like "Arild Hareide" or "Bakke-Jensen", but require titlecase on the second part (so "Oslo lufthavn" doesn't go through)
# We also allow the regex to match just a single uppercase letter + single dot, like initial "E."
# wf tests the wordform ("<…>"), bf the baseform. Each regex has exactly one
# capturing group (the whole name); the MERGECOHORTS rules below use these
# captures as numbered variables ($1, $2, …), counted in the order the sets
# appear in the rule's target and WITH tests.
LIST wf = "<(\\p{Lu}\\p{Ll}+(?:[ -]\\p{Lu}\\p{Ll}+)?|\\p{Lu}[.])>"r ;
LIST bf = "(\\p{Lu}\\p{Ll}+(?:[ -]\\p{Lu}\\p{Ll}+)?|\\p{Lu}[.])"r ;
LIST AA = "<\\p{Lu}\\p{Lu}.*>"r ; # wordform that starts with two upper case chars (e.g. an all-caps word)
# Name categories used by the merge rules. The rule labels further down use
# f = first name (fn), m = middle name (mn), l = last name (ln), i = initial (init).
# The "/init" variants are the same sets with (np init) added, for slots where
# an initial such as "E." is also acceptable.
LIST fn = (np ant) ;
LIST ln = (np cog) (np top) (np cgguess) ; # NB: toponyms and guessed names are also considered possible middle/last names
LIST mn = (np ant) (np cog) (np top) ;
LIST mn/init = (np ant) (np cog) (np top) (np init) ;
LIST ln/init = (np cog) (np top) (np cgguess) (np init) ; # NB: toponyms and guessed names are also considered possible middle/last names
LIST init = (np init) ;
# org is only used by the org+org rule, which merges two adjacent (np org) cohorts.
LIST org = (np org) ;
# aa matches a tag made up only of the letters A/a (e.g. "Aa", "aa" in the
# example input); syn matches any tag starting with "@" (e.g. @app, @subst→).
# NOTE(review): the escaped parentheses appear to act as a regex capture group,
# because the rules below count one $N slot for aa and one for syn (e.g. VSTR:$8
# after three wf/bf pairs + aa + syn, but VSTR:$9 after four wf/bf pairs + syn
# without aa). Removing aa from a rule therefore shifts the index of the syn
# capture — confirm before acting on the TODO below.
LIST aa = /^\([Aa]+\)$/r ; # TODO: no longer needed, now we use * to copy tags from first word
LIST syn = /^\(@.*\)$/r ;
# Plain single-tag sets, so the rules can refer to the tags by set name.
LIST gen = gen ;
LIST np = np ;
# Wildcard: matches any cohort; used as the "don't care" slot in TEMPLATE twoSpaces.
LIST _ = (*) ;
# We can merge up to three lemmas in a row, but if incoming cohorts
# already have spaces we might end up with longer sequences than .dix
# can handle. The three-lemma rules should at most allow one of the
# cohorts to have spaces; the below TEMPLATE will match if two of them
# have spaces:
# hasSpace: a quoted form that contains a space, i.e. a cohort that is already
# a multi-word unit.
LIST hasSpace = ".* .*"r ;
# twoSpaces: matches when at least two of three consecutive cohorts (starting
# at the position the template is tested from) contain a space. One alternative
# per pair of positions: (1st,2nd), (2nd,3rd), (1st,3rd); "_" is the wildcard.
# The merge rules use it negated: (NEGATE 0 T:twoSpaces).
TEMPLATE twoSpaces =
([ hasSpace, hasSpace, _ ])
OR ([ _ , hasSpace, hasSpace ])
OR ([ hasSpace, _ , hasSpace ])
;
# Never merge these toponyms into a person name. They need to be listed because
# the ln/mn sets accept (np top) readings as possible middle/last names.
# The list is matched by baseform; it is consumed by the PROTECT rules below.
LIST top-exceptions = "Norge" "Noreg" "Japan" "USA" "Sverige" "Danmark" "Finland" "Island" "Storbritannia"
"India" "Pakistan" "Syria" "Tyskland" "Russland" "Ukraina"
;
AFTER-SECTIONS # only run once
# Shield np readings from the merge rules below whenever an excepted toponym is
# the cohort itself (0), the next cohort (1) or the one after that (2). These
# are the offsets a two- or three-cohort merge starting here would cover.
# NOTE(review): the four-cohort rule f+i+m+l also consumes offset 3, and its
# last slot (ln) accepts (np top); there is no matching protection for offset 3.
# Confirm whether that is intended before changing anything — a blanket
# "(3 top-exceptions)" PROTECT would also stop shorter merges starting here.
PROTECT np IF (0 top-exceptions) ;
PROTECT np IF (1 top-exceptions) ;
PROTECT np IF (2 top-exceptions) ;
# First genitives:
# f+m+lG: first name + middle name/initial + GENITIVE last name -> one cohort.
# Capture slots, in order of appearance:
#   $1/$2 = wordform/baseform of the target (first name)
#   $3/$4 = wordform/baseform of cohort 1, $5/$6 = of cohort 2
#   $7 = aa tag and $8 = syn tag of cohort 2 (see NOTE on capture counting at LIST aa)
# New cohort: wordform joined from the three wordforms, baseform joined from the
# three baseforms, tags cgguess + gen, "*" (copies the first word's tags) and the
# last word's syntactic tag (VSTR:$8).
# Conditions: no cohort may have a sub-reading (N/1), i.e. no compound analyses;
# only the last cohort may (and must) be genitive; cohorts 1 and 2 must not be
# AA wordforms; at most one of the three cohorts may already contain a space.
MERGECOHORTS:f+m+lG ("<$1 $3 $5>"v "$2 $4 $6"v cgguess gen * VSTR:$8)
wf + bf + fn
(NEGATE 0/1 (*)) (NEGATE 0 gen)
(NEGATE 1/1 (*)) (NEGATE 1 gen) (NEGATE 1 AA)
(NEGATE 2/1 (*)) (2 gen) (NEGATE 2 AA)
(NEGATE 0 T:twoSpaces)
WITH
(1 wf + bf + mn/init)
(2 wf + bf + ln + aa + syn)
;
# m+lG: first/middle name + GENITIVE last name -> one cohort.
# Capture slots: $1/$2 = wordform/baseform of the target, $3/$4 = of cohort 1,
# $5 = aa tag and $6 = syn tag of cohort 1.
# New cohort: wordform from both wordforms, baseform from both baseforms, tags
# cgguess + gen, "*" (copies the first word's tags) and the last word's
# syntactic tag (VSTR:$6).
# Conditions: neither cohort has a sub-reading (no compound analyses); the
# target is not genitive, cohort 1 is genitive and not an AA wordform.
MERGECOHORTS:m+lG ("<$1 $3>"v "$2 $4"v cgguess gen * VSTR:$6)
wf + bf + mn
(NEGATE 0/1 (*)) (NEGATE 0 gen)
(NEGATE 1/1 (*)) (1 gen) (NEGATE 1 AA)
WITH
(1 wf + bf + ln + aa + syn)
;
# Then non-genitives:
# For these we allow compounds, and just use form as lemma (since compound lemmas are split):
# f+i+m+l: first name + initial + middle name/initial + last name -> one cohort.
# Capture slots: $1/$2, $3/$4, $5/$6, $7/$8 = wordform/baseform of cohorts 0..3;
# $9 = syn tag of cohort 3 (this rule has no aa test, hence $9 and not $10).
# Both the new wordform and the new baseform are built from the WORDFORM
# captures ($1 $3 $5 $7); the baseform captures are not used here.
# Conditions: none of the four cohorts is genitive; cohorts 1-3 are not AA
# wordforms; sub-readings (compounds) are allowed, unlike in the genitive rules.
# NOTE(review): T:twoSpaces only inspects three consecutive cohorts from
# position 0, so a space in the fourth cohort is not counted — confirm this is
# acceptable for the .dix length limit mentioned at the template.
MERGECOHORTS:f+i+m+l ("<$1 $3 $5 $7>"v "$1 $3 $5 $7"v cgguess * VSTR:$9)
wf + bf + fn
(NEGATE 0 gen)
(NEGATE 1 gen) (NEGATE 1 AA)
(NEGATE 2 gen) (NEGATE 2 AA)
(NEGATE 3 gen) (NEGATE 3 AA)
(NEGATE 0 T:twoSpaces)
WITH
(1 wf + bf + init)
(2 wf + bf + mn/init)
(3 wf + bf + ln + syn)
; # We can handle four in a row if second is initial
# f+m+l: first name + middle name/initial + last name (no genitive) -> one cohort.
# Capture slots: $1/$2, $3/$4, $5/$6 = wordform/baseform of cohorts 0..2;
# $7 = aa tag and $8 = syn tag of cohort 2.
# Wordform and baseform of the new cohort are both built from the wordform
# captures ($1 $3 $5). Tags: cgguess, "*" (copies the first word's tags) and
# the last word's syntactic tag (VSTR:$8).
# Conditions: no cohort is genitive; cohorts 1 and 2 are not AA wordforms; at
# most one of the three cohorts may already contain a space.
MERGECOHORTS:f+m+l ("<$1 $3 $5>"v "$1 $3 $5"v cgguess * VSTR:$8)
wf + bf + fn
(NEGATE 0 gen)
(NEGATE 1 gen) (NEGATE 1 AA)
(NEGATE 2 gen) (NEGATE 2 AA)
(NEGATE 0 T:twoSpaces)
WITH
(1 wf + bf + mn/init)
(2 wf + bf + ln + aa + syn)
;
# m+l: first/middle name + last name or initial (no genitive) -> one cohort.
# This is the rule that produces "Rolv Ravlo" in the example at the top.
# Capture slots: $1/$2 = wordform/baseform of the target, $3/$4 = of cohort 1,
# $5 = aa tag and $6 = syn tag of cohort 1.
# Wordform and baseform are both built from the wordform captures ($1 $3).
# Conditions: neither cohort is genitive; cohort 1 is not an AA wordform.
MERGECOHORTS:m+l ("<$1 $3>"v "$1 $3"v cgguess * VSTR:$6)
wf + bf + mn
(NEGATE 0 gen)
(NEGATE 1 gen) (NEGATE 1 AA)
WITH
(1 wf + bf + ln/init + aa + syn)
;
# org+org: two adjacent organisation-name cohorts (np org) -> one cohort.
# Same shape as m+l: $1/$2 and $3/$4 are the wordform/baseform captures,
# $5 = aa tag and $6 = syn tag of cohort 1; wordform and baseform are both
# built from the wordform captures.
# Conditions: neither cohort is genitive; cohort 1 is not an AA wordform.
MERGECOHORTS:org+org ("<$1 $3>"v "$1 $3"v cgguess * VSTR:$6)
wf + bf + org
(NEGATE 0 gen)
(NEGATE 1 gen) (NEGATE 1 AA)
WITH
(1 wf + bf + org + aa + syn)
;
# Drop the @subst→ function that merged cohorts inherit from their first word
# (the merge rules copy the first word's tags via "*" and then add the last
# word's @-tag). In the example at the top, the first names carry @subst→ and
# the expected merged output keeps only @app.
# NOTE(review): this presumably relies on CG-3 treating each mapping (@) tag of
# a reading separately for REMOVE — confirm against the manual.
REMOVE:1@ (@subst→) ;
# Normalise tag order on merged (cgguess) readings. Each rule has the form
# SUBSTITUTE (tags to remove) (tags to insert) target: the merge rules emit
# "cgguess [gen]" first and the copied "np <type> …" tags after it; these rules
# move "np <type>" to the front, followed by cgguess, with gen last.
# :gen rules handle genitive readings. For (np ant) there are extra variants
# that also reposition a gender tag (m, f, nt) so it comes before gen.
# :nom rules use "- (gen)" in the target so they skip genitive readings.
SUBSTITUTE:gen (cgguess gen np ant) (np ant cgguess gen) (cgguess gen np ant) ;
SUBSTITUTE:gen (cgguess gen np ant m) (np ant cgguess m gen) (cgguess gen np ant m) ;
SUBSTITUTE:gen (cgguess gen np ant f) (np ant cgguess f gen) (cgguess gen np ant f) ;
SUBSTITUTE:gen (cgguess gen np ant nt) (np ant cgguess nt gen) (cgguess gen np ant nt) ;
SUBSTITUTE:nom (cgguess np ant) (np ant cgguess ) (cgguess np ant) - (gen) ;
# Last names (np cog):
SUBSTITUTE:gen (cgguess gen np cog) (np cog cgguess gen) (cgguess gen np cog) ;
SUBSTITUTE:nom (cgguess np cog) (np cog cgguess ) (cgguess np cog) - (gen) ;
# Toponyms (np top):
SUBSTITUTE:gen (cgguess gen np top) (np top cgguess gen) (cgguess gen np top) ;
SUBSTITUTE:nom (cgguess np top) (np top cgguess ) (cgguess np top) - (gen) ;
# Organisations (np org):
SUBSTITUTE:gen (cgguess gen np org) (np org cgguess gen) (cgguess gen np org) ;
SUBSTITUTE:nom (cgguess np org) (np org cgguess ) (cgguess np org) - (gen) ;