From cd92257bb0d63f991094e2cdf2d81b6b5934b510 Mon Sep 17 00:00:00 2001 From: Ben Bonfil Date: Wed, 20 Dec 2023 14:44:38 +0100 Subject: [PATCH] improvements for R-pronouns and relativisation (#10) (#11) Co-authored-by: Jan Odijk --- mwe_query/canonicalform.py | 23 +++++++++++++---- mwe_query/lcat.py | 7 ++++- mwe_query/trymwes.py | 52 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 mwe_query/trymwes.py diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index c9e771c..3612cea 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -386,7 +386,14 @@ def mknewnode(stree, mwetop, atts, annotations): newnode.attrib['maxnodecount'] = f'{len(stree)}' return newnode - +def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties): + phraserel = gav(nonheadwordnode, 'rel') + newnonheadwordnode = copy.copy(nonheadwordnode) + newnonheadwordnode.attrib['rel'] = 'hd' + phrasenode = ET.Element('node', attrib=phrasenodeproperties) + phrasenode.attrib['rel'] = phraserel + phrasenode.append(newnonheadwordnode) + return phrasenode def zullenheadclause(stree: SynTree) -> bool: if stree.tag == 'node': cat = gav(stree, 'cat') @@ -1016,9 +1023,10 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: Rpronounobj1node = copy.copy(obj1node) Rpronounobj1node.attrib['lemma'] = 'er|hier|daar|waar|ergens|nergens|overal' Rpronounobj1node.attrib['pt'] = 'vnw' + newphrase = expandnonheadwordnode(Rpronounobj1node, {}) for child in newppnode2: newppnode2.remove(child) - newppnode2.append(Rpronounobj1node) + newppnode2.append(newphrase) newppnode2.append(newvz2) # pp with R-pronoun object which has been replaced by a full NO with a dummymod @@ -1047,11 +1055,15 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: pppronadvvcnode.append(pronadvnode1) pppronadvvcnode.append(newvcnode) + # pp's with a pronominal adverb. e.g. 
daarnaar pprel = gav(ppnode, 'rel') pronadvnode = getpronadv(vzlemma, pprel) + pronadvppnode = expandnonheadwordnode(pronadvnode, {'cat': 'pp', 'rel': pprel}) + pronadvnode.attrib['rel'] = 'hd' + pronadvppnode.append(pronadvnode) alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3], [ - pppobj1vcnode], [pppronadvvcnode], [pronadvnode]]) + pppobj1vcnode], [pppronadvvcnode], [pronadvppnode]]) parent.append(alternativesnode) vblgennpnodeids = newstree.xpath( @@ -1404,8 +1416,9 @@ def relpronsubst(stree: SynTree) -> SynTree: def expandfull(stree: SynTree) -> SynTree: # possibly add getlcat stree1 = relpronsubst(stree) - stree2 = indextransform(stree1) - return stree2 + stree2 = expandnonheadwords(stree1) + stree3 = indextransform(stree2) + return stree3 def gettopnode(stree): diff --git a/mwe_query/lcat.py b/mwe_query/lcat.py index 82ccf4c..76117f5 100644 --- a/mwe_query/lcat.py +++ b/mwe_query/lcat.py @@ -8,6 +8,8 @@ import copy import lxml.etree as ET +dummy = 'dummy' + def expandnonheadwords(stree: SynTree) -> SynTree: # it is presupposed that the input stree is not None @@ -47,7 +49,8 @@ def getlcatatt(node: SynTree) -> str: def mkphrase(child: SynTree) -> SynTree: newnode = ET.Element('node') - newnode.attrib['id'] = child.attrib['id'] + 'a' + if 'id' in child.attrib: + newnode.attrib['id'] = child.attrib['id'] + 'a' lcat = getlcatatt(child) if lcat in validcats: newnode.attrib['cat'] = lcat @@ -176,6 +179,8 @@ def getlcat(node: SynTree, prel=None) -> str: # noqa: C901 result = 'np' elif pt == 'spec': result = None + elif pt == dummy: + result = None else: print('Unknown att value (pt) encountered in:') ET.dump(node) diff --git a/mwe_query/trymwes.py b/mwe_query/trymwes.py new file mode 100644 index 0000000..3962fc2 --- /dev/null +++ b/mwe_query/trymwes.py @@ -0,0 +1,52 @@ +from sastadev.alpinoparsing import parse +from lcat import expandnonheadwords +from sastadev.treebankfunctions import indextransform +from lxml import etree +from 
canonicalform import generatequeries, expandfull + +debug = False + +geenhaankraaien = ('0geen *haan zal naar iets kraaien', + ['Daar kraait geen haan naar', 'Hier heeft geen haan naar gekraaid', + 'geen haan kraaide daarnaar', 'geen haan kraaide ernaar dat hij niet kwam', + 'geen haan kraaide er naar dat hij niet kwam', + 'er is geen haan die daar naar kraait', ] + ) + +def select(mweutts, utt=None): + if utt is None: + result = mweutts + else: + result = (mweutts[0], [mweutts[1][utt]]) + return result + +def getparses(utterances): + uttparses = [] + for utterance in utterances: + uttparse = parse(utterance) + uttparses.append(uttparse) + return uttparses + +def trysomemwes(): + mwe, utterances = select(geenhaankraaien) + mwequeries = generatequeries(mwe) + labeledmwequeries = (('MWEQ', mwequeries[0]), ('NMQ', mwequeries[1]), ('MLQ', mwequeries[2])) + uttparses = getparses(utterances) + for utterance, uttparse in zip(utterances, uttparses): + print(f'{utterance}:') + expandeduttparse = expandfull(uttparse) + if debug: + etree.dump(expandeduttparse) + for label, mwequery in labeledmwequeries: + results = expandeduttparse.xpath(mwequery) + if debug: + print('Found hits:') + for result in results: + etree.dump(result) + print(f'{label}: {len(results)}') + + + + +if __name__ == '__main__': + trysomemwes() \ No newline at end of file