-
Notifications
You must be signed in to change notification settings - Fork 0
/
export-example-corpus-vot.xq
72 lines (59 loc) · 1.79 KB
/
export-example-corpus-vot.xq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import module namespace keeleleek ="http://vadja.keeleleek.ee" at "eelexify-module.xqm";
declare namespace vot = "http://www.eki.ee/dict/vot";
declare option output:method "xml";
declare option output:indent "no";
declare option output:omit-xml-declaration "yes";
(:~
 : Export to Korp with Giellatekno tags.
 :
 : Recursive typeswitch pattern from
 : https://en.m.wikibooks.org/wiki/XQuery/Typeswitch_Transformations
 :
 : Handling per node kind:
 :   - w elements become one line of text: a newline, the token text and,
 :     when the attributes are present, a tab plus @lemma, the @pos wrapped
 :     as " //_POS_ " and the @analysis followed by ", //".
 :   - every other element is re-created under the same name with all of
 :     its attributes; its children are exported recursively and a newline
 :     is emitted before the element and before its end tag.
 :   - all remaining node kinds (text, comments, ...) are dropped.
 :
 : The line fragments are returned as text nodes, not as strings: adjacent
 : atomic values inside an element constructor are joined with a single
 : space, which would append a stray blank to every token line. Adjacent
 : text nodes are merged without any separator.
 :
 : @param $nodes the nodes to export
 : @return a sequence of text nodes and elements, in input order
 :)
declare function local:export-to-giellatekno-vrt($nodes as node()*)
{
  for $node in $nodes
  return
    typeswitch ($node)
      case (element(w))
        return
          text {
            concat(
              out:nl(),
              (: 1) token :)
              $node/text(),
              (: 2) lemma+morphemes :)
              if (exists($node/@lemma)) then (out:tab() || $node/@lemma) else (),
              if (exists($node/@pos)) then (" //_" || $node/@pos || "_ ") else (),
              if (exists($node/@analysis)) then ($node/@analysis || ", //") else ()
            )
          }
      case (element(*))
        return
          (
            text { out:nl() }, (: newline in front of the start tag :)
            element {name($node)} {(
              $node/@*, (: pass through all attributes :)
              (: the function iterates over its argument itself :)
              local:export-to-giellatekno-vrt($node/node()),
              text { out:nl() } (: newline in front of the end tag :)
            )}
          )
      default
        return
          ()
};
(: Main query: turn every distinct example sentence of the dictionary
   database into an <s> element with one <w> per token, wrap the sentences
   in <corpus>/<text> and hand the tree to the exporter. :)

(: separators: a literal "...", one whitespace character, or one
   punctuation mark :)
let $separator-pattern := '(\.\.\.)|\s|[.,…?!:;]'
let $sentences :=
  for $sentence in distinct-values(db:open($keeleleek:db-name)//vot:näitelause/text())
  let $pieces := analyze-string(normalize-space($sentence), $separator-pattern)
  return
    <s>{
      (: keep words and punctuation, skip the single blanks between them :)
      $pieces//text()[not(. = " ")] ! <w>{.}</w>
    }</s>
let $corpus :=
  <corpus title="Vadja keele sõnaraamat (2013)">
    <text title="Vadja näitelaused">{ $sentences }</text>
  </corpus>
return local:export-to-giellatekno-vrt($corpus)