From 5551a7ec2ea80b93c81f2bf2c21fc2c4e91d7608 Mon Sep 17 00:00:00 2001 From: Franck Bodmer Date: Thu, 19 Oct 2023 10:01:59 +0200 Subject: [PATCH] Issue #66: REG: missing #REG-Operator implemented: rebased. Added new tests. Moved general purpose methods to StringUtils.java. Change-Id: I42f12251a73511fff07b48e06f6018ba1e181433 Reviewed-on: https://korap.ids-mannheim.de/gerrit/c/KorAP/Koral/+/7658 Reviewed-by: Nils Diewald --- pom.xml | 1 + src/main/antlr/cosmas/c2ps.g | 34 ++- .../korap/query/parse/cosmas/c2ps_opBED.java | 5 +- .../korap/query/parse/cosmas/c2ps_opREG.java | 235 ++++++++++++++++++ .../serialize/Cosmas2QueryProcessor.java | 156 ++++++++++-- .../query/serialize/QuerySerializer.java | 13 +- .../ids_mannheim/korap/util/StringUtils.java | 157 ++++++++++++ .../cosmas2/Cosmas2QueryProcessorTest.java | 225 ++++++++++++++++- 8 files changed, 795 insertions(+), 31 deletions(-) create mode 100644 src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java create mode 100644 src/main/java/de/ids_mannheim/korap/util/StringUtils.java diff --git a/pom.xml b/pom.xml index 6bdd5574..8d22bf5f 100644 --- a/pom.xml +++ b/pom.xml @@ -177,6 +177,7 @@ **/c2ps_opIN.java **/c2ps_opOV.java **/c2ps_opPROX.java + **/c2ps_opREG.java **/c2ps_opWF.java **/c2ps_optCase.java **/.gitignore diff --git a/src/main/antlr/cosmas/c2ps.g b/src/main/antlr/cosmas/c2ps.g index c264ea63..8908a494 100644 --- a/src/main/antlr/cosmas/c2ps.g +++ b/src/main/antlr/cosmas/c2ps.g @@ -1,16 +1,20 @@ - // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * -// // -// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) // -// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). // -// 17.12.12/FB // -// v-0.6 // -// TODO: // -// - se1: Einsetzen des Default-Operators in den kumulierten AST. // +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +// +// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) +// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). +// 17.12.12/FB +// v-0.6 +// TODO: +// - se1: Einsetzen des Default-Operators in den kumulierten AST. +// +// v0.7 - 25.07.23/FB +// - added: #REG(x) // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * grammar c2ps; options { output=AST; backtrack=true; k=5;} +// tokens that will appear as node names in the resulting AST: tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX; ARG1; ARG2; OPWF; OPLEM; OPANNOT; @@ -21,6 +25,7 @@ tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX; OPNOT; OPEXPR1; OPMORPH; OPELEM; + OPREG; } @header {package de.ids_mannheim.korap.query.parse.cosmas;} @@ -76,6 +81,14 @@ OP_IN : '#IN' | '#IN(' OP_IN_OPTS? ')' ; OP_OV : '#OV' | '#OV(' OP_OV_OPTS? ')' ; +// #REG(abc['"]) or #REG('abc\'s') or #REG("abc\"s"): + +OP_REG : '#REG(' ' '* '\'' ('\\\''|~'\'')+ '\'' (' ')* ')' + | + '#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')' + | + '#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')'; + // EAVEXP wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'. fragment OP_IN_OPTS : EAVEXPR ; @@ -241,7 +254,7 @@ opNOT : ('nicht' | 'NICHT' | 'not' | 'NOT') -> ^(OPNOT); // OP1: Suchoperatoren mit 1 Argument: // ----------------------------------- -op1 : opBEG | opEND | opNHIT | opALL | opBED; +op1 : opBEG | opEND | opNHIT | opALL | opBED | opREG; // #BED(serchExpr, B). // B muss nachträglich in einer lokalen Grammatik überprüft werden. @@ -259,3 +272,6 @@ opEND : ( '#END(' | '#RECHTS(' ) searchExpr ')' -> ^(OPEND searchExpr) ; opNHIT : ( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ; opALL : ( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')' -> ^(OPALL searchExpr) ; + +opREG : OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ; + diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java index fb9df4e8..35f64379 100644 --- a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java +++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java @@ -17,7 +17,8 @@ public static Tree check (String input, int index) { c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null; /* - System.out.println("check opBED: " + index + ": " + input); + System.out.format("opBED: check: input='%s', index=%d.\n", input, index); + System.out.format("opBED: tokens ='%s'.\n", tokens.toString()); System.out.flush(); */ @@ -68,7 +69,7 @@ public static Tree checkTPos (String input, int index) { public static void main (String args[]) throws Exception { - String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" }; + String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"}; Tree tree; for (int i = 0; i < input.length; i++) { diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java new file mode 100644 index 00000000..a798647a --- /dev/null +++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java @@ -0,0 +1,235 @@ +package de.ids_mannheim.korap.query.parse.cosmas; + +import org.antlr.runtime.*; +import org.antlr.runtime.tree.*; + +import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener; +import de.ids_mannheim.korap.util.StringUtils; + +/* + * 1. transforms and encodes a regular COSMAS II like expression #REG(regexpr) + * into a AST tree -> encode(). + * 2. transforms tree into the corresponding Koral:token/Koral:term, like: + * e.g. #REG(abc[']?s) -> + * { + * "@type": "koral:term", + * "match": "match:eq", + * "type" : "type:regex", + * "key" : "abc[']?s", + * "layer": "orth" + * }... + * + * - see doc: http://korap.github.io/Koral/ + * - generation of koral:term -> processOPREG(). + * 06.09.23/FB + */ + +public class c2ps_opREG + +{ + private static boolean DEBUG = false; + + /* + * encode(): + * + * input = e.g. "#REG('abc(d|e)*')" -> return AST = (OPREG abc(d|e)*): + * + * Returned String: no enclosing "..." needed, so no escaping of " nor \ needed. + * 06.09.23/FB + */ + public static Tree encode (String input, int tokenType) + + { + if( DEBUG ) + { + System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType); + System.out.flush(); + } + + if( input.substring(0, 5).compareToIgnoreCase("#REG(") != 0 || input.charAt(input.length()-1) != ')' ) + { + // error: '#REG(' and ')' not found: return input unchanged. + if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input); + return new CommonTree(new CommonToken(tokenType, input)); + } + + + StringBuffer sb = new StringBuffer(input.substring(5)); + sb.deleteCharAt(sb.length()-1); + + // #REG("a"), #REG(a), #REG('a') -> >>a<<. + // enclosing ".." are appended at the end of this function. + // a. remove blanks around ".." and '..', + // e.g. a. #REG( ' abc ' ) -> #REG(' abc '). + + StringUtils.removeBlanksAtBothSides(sb); + + if( sb.charAt(0) == '\'' || sb.charAt(0) == '"') + { + // remove pairwise at both ends. + sb.deleteCharAt(0); + if( sb.charAt(sb.length()-1) == '\'' || sb.charAt(sb.length()-1) == '"' ) + sb.deleteCharAt(sb.length()-1); + } + + // b. remove blanks inside '..' or "..", + // E.g. #REG(' abc ') -> #REG('abc'): + + StringUtils.removeBlanksAtBothSides(sb); + + /* unescape >>'<<, >>"<< and >>\<<. + * e.g. #REG('that\'s') -> "that\'s" -> >>that's<<. + */ + + for(int i=0; i>%s<<.\n", sb.toString()); + + return new CommonTree(new CommonToken(tokenType, sb.toString())); + + } // encode + + /* + * printTokens: + * Notes: + * - must build a separate CommonTokenStream here, because + * tokens.fill() will consume all tokens. + * - prints to stdout list of tokens from lexer. + * - mainly for debugging. + * 14.09.23/FB + * + */ + + private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener) + + { + ANTLRStringStream + ss = new ANTLRStringStream(query); + c2psLexer + lex = new c2psLexer(ss); + org.antlr.runtime.CommonTokenStream + tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3 + + lex.setErrorReporter(errorListener); + + // get all tokens from lexer: + tokens.fill(); + + System.out.printf("opREG.check: no. of tokens = %d.\n", tokens.size()); + for(int i=0; i>#REG(\" a"s\")<<. + lex.setErrorReporter(errorListener); + ((c2psParser) g).setErrorReporter(errorListener); + + if( DEBUG ) + { + //System.out.format("opREG.check: input='%s', index=%d.\n", query, index); + printTokens(query, errorListener); + System.out.flush(); + } + + + try { + c2psParser.c2ps_query_return + c2Return = ((c2psParser) g).c2ps_query(); // statt t(). + + // AST Tree anzeigen: + tree = (Tree) c2Return.getTree(); + //if (DEBUG) + // System.out.printf("opREG.check: tree = '%s'.\n", tree.toStringTree()); + } + catch (RecognitionException e) { + System.err.printf("c2po_opREG.check: Recognition Exception!\n"); + } + + return tree; + } // check + + + /** + * main + */ + + public static void main (String args[]) throws Exception + + { + String input[] = { "#REG(abc)", + "#REG(def's)", + "#REG( def's )", // all blanks should be removed. + "#REG( ' def\\'s ' )", // same + "#REG( \" def's \" )", // same + "#REG(abc[\"]ef)", + "#REG('abc')", // ' fehlt: generates Syntax Error . + "#REG('abc\')", // User input = #REG('abc\') : OK, nothing escaped. + "#REG('abc\'')", // User input = #REG('abc\') : OK, nothing escaped. + "#REG('abc\\')", // User input = #REG('abc\') : OK, same behavior: \\ == \. + "#REG((a|b))", // broken input, should use ".." or '..'. + "#REG('(a|b)')", // OK. + "#REG(\"(a|b)\")", // OK. + "#REG(^[A-Z]+abc[\']*ung$)", + "#REG('ab(cd|ef)*')", + "#REG('abc(def|g)*[)(]')", + "#REG(\"abc(def|g)*[)(]\")", + "#REG('abc[\"]')", // User input = #REG('abc["]') : OK, needs escape => #REG("...\"...") + "#REG(\"abc[\\\"]\")", // User input = #REG("abc["]") : broken because of 2nd " -> syntax error. + "#REG(\"abc[\\\"]\")", // User input = #REG("abc[\"]"): OK, already escaped by user => #REG("...\"...") + "#REG(\"abc[\\\\\"]\")" // User input = #REG("abc[\\"]") : broken. with escaped " => #REG("...\"...") + }; + Tree tree; + + for (int i = 0; i < input.length; i++) + { + System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]); + tree = check(input[i], 0); + System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i, tree.toStringTree()); + } + + + } // main + +} diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java index 69a6293a..8bbfa351 100644 --- a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java +++ b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java @@ -15,6 +15,7 @@ import de.ids_mannheim.korap.query.serialize.util.KoralObjectGenerator; import de.ids_mannheim.korap.query.serialize.util.ResourceMapper; import de.ids_mannheim.korap.query.serialize.util.StatusCodes; +import de.ids_mannheim.korap.util.StringUtils; import org.antlr.runtime.ANTLRStringStream; import org.antlr.runtime.RecognitionException; @@ -127,7 +128,6 @@ public class Cosmas2QueryProcessor extends Antlr3AbstractQueryProcessor { public static Pattern wildcardPlusPattern = Pattern.compile("([+])"); public static Pattern wildcardQuestionPattern = Pattern.compile("([?])"); - /** * @param tree * The syntax tree as returned by ANTLR @@ -142,6 +142,7 @@ public Cosmas2QueryProcessor (String query) { process(query); if (DEBUG) { log.debug(">>> " + requestMap.get("query") + " <<<"); + System.out.printf("Cosmas2QueryProcessor: >>%s<<.\n", requestMap.get("query")); } } @@ -151,14 +152,19 @@ public void process (String query) { Tree tree = null; tree = parseCosmasQuery(query); if (DEBUG) { + System.out.printf("\nProcessing COSMAS II query: %s.\n\n", query); log.debug("Processing CosmasII query: " + query); } - if (tree != null) { - if (DEBUG) { - log.debug("ANTLR parse tree: " + tree.toStringTree()); - } + if (tree != null) + { + + if (DEBUG) { + log.debug("ANTLR parse tree: " + tree.toStringTree()); + System.out.printf("\nANTLR parse tree: %s.\n\n", tree.toStringTree()); + } + processNode(tree); - } + } } @@ -278,6 +284,11 @@ private void processNode (Tree node) { if (nodeCat.equals("OPBED")) { processOPBED(node); } + + if (nodeCat.equals("OPREG")) { + processOPREG(node); + } + objectsToPop.push(stackedObjects); toWrapsToPop.push(stackedToWrap); @@ -444,6 +455,88 @@ else if (conditionCount < conditionGroups.size()) { } } + /* processOPREG: + * + * - input Node structure is: (OPREG "regexpr"). + * - transforms tree into the corresponding Koral:token/Koral:term, like: + * e.g. #REG(abc[']?s) -> + * { + * "@type": "koral:term", + * "match": "match:eq", // optional + * "type" : "type:regex", + * "key" : "abc[']?s", + * "layer": "orth" + * }. + * + * - see doc: http://korap.github.io/Koral/ + * + * 06.09.23/FB + */ + + private void processOPREG (Tree node) + + { + int + nChild = node.getChildCount() - 1; + Tree + nodeChild = node.getChild(0); + boolean + bDebug = false; + + if( DEBUG ) + { + //System.out.printf("Debug: processOPREG: node='%s' nChilds=%d.\n", node.toStringTree(), nChild+1); + System.out.printf("Debug: processOPREG: child: >>%s<< cat=%s type=%d.\n", + nodeChild.getText(), getNodeCat(node), nodeChild.getType()); + } + + // empty case (is that possible?): + if( nChild < 0 ) + return; + + // see processOPWF_OPWF_OPLEM + // for how to insert regexpr into Koral JSON-LD + + Map + token = KoralObjectGenerator.makeToken(); + + objectStack.push(token); + stackedObjects++; + + Map + fieldMap = KoralObjectGenerator.makeTerm(); + + token.put("wrap", fieldMap); + + // make category-specific fieldMap entry: + /* + System.out.printf("Debug: processOPREG: before replaceALL: >>%s<<.\n", nodeChild.toStringTree()); + String + value = nodeChild.toStringTree().replaceAll("\"", ""); + System.out.printf("Debug: processOPREG: after replaceALL: >>%s<<.\n", value); + */ + + /* replace replaceALL() by replaceIfNotEscaped() to delete every occurence of >>"<< + * which is not escaped by >>\<<, as it is important to keep the escaped sequence for + * the argument of #REG(). + * This is not possible with replaceALL(). + */ + String + value = nodeChild.toStringTree(); // old version: replaceDoubleQuotes(nodeChild.toStringTree()); + + if( bDebug ) + System.out.printf("Debug: processOPREG: key: >>%s<<.\n", value); + + fieldMap.put("key", value); + fieldMap.put("layer", "orth"); + fieldMap.put("type", "type:regex"); + fieldMap.put("match", "match:eq"); + + // decide where to put (objPos=1, not clear why, but it works only like that - 20.09.23/FB): + putIntoSuperObject(token,1); + + } // processOPREG + private void processOPNHIT (Tree node) { Integer[] classRef = new Integer[] { classCounter + 128 + 1, @@ -1511,19 +1604,40 @@ private Map wrap (Map[] wrapCascade) { @SuppressWarnings("unchecked") - private void putIntoSuperObject (Map object, - int objStackPosition) { - if (objectStack.size() > objStackPosition) { + private void putIntoSuperObject (Map object, int objStackPosition) + + { + if( DEBUG ) + { + System.out.printf("Debug: putIntosuperObject(<>,int): objectStack.size=%d objStackPos=%d object=%s.\n", + objectStack.size(), objStackPosition, object == null ? "null" : "not null"); + + if( objectStack != null && objectStack.size() > 0 ) + System.out.printf("Debug: putIntosuperObject: objectStack = %s.\n", objectStack.toString()); + + if( invertedOperandsLists != null ) + System.out.printf("Debug: putIntosuperObject: invertedOperandsLists: [%s].\n", invertedOperandsLists.toString()); + } + + + if (objectStack.size() > objStackPosition) + { ArrayList topObjectOperands = - (ArrayList) objectStack.get(objStackPosition) - .get("operands"); - if (!invertedOperandsLists.contains(topObjectOperands)) { + (ArrayList) objectStack.get(objStackPosition).get("operands"); + + if( DEBUG ) + System.out.printf("Debug: putIntosuperObject: topObjectOperands = [%s].\n", topObjectOperands == null ? "null" : "not null"); + + objectStack.get(objStackPosition); + + if (!invertedOperandsLists.contains(topObjectOperands)) + { topObjectOperands.add(object); - } + } else { topObjectOperands.add(0, object); - } - } + } + } else { requestMap.put("query", object); } @@ -1618,7 +1732,8 @@ private Map termToFieldMap (String term) { private Tree parseCosmasQuery (String query) { - query = rewritePositionQuery(query); + + query = rewritePositionQuery(query); Tree tree = null; Antlr3DescriptiveErrorListener errorListener = new Antlr3DescriptiveErrorListener(query); @@ -1627,16 +1742,23 @@ private Tree parseCosmasQuery (String query) { c2psLexer lex = new c2psLexer(ss); org.antlr.runtime.CommonTokenStream tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3 + + // System.out.printf("parseCosmasQuery: tokens = %d\n", tokens.size()); + // System.out.printf("parseCosmasQuery: tokens = %s\n", tokens.toString()); + parser = new c2psParser(tokens); + // Use custom error reporters lex.setErrorReporter(errorListener); ((c2psParser) parser).setErrorReporter(errorListener); + c2psParser.c2ps_query_return c2Return = ((c2psParser) parser).c2ps_query(); // statt t(). + // AST Tree anzeigen: tree = (Tree) c2Return.getTree(); if (DEBUG) log.debug(tree.toStringTree()); - } + } catch (RecognitionException e) { log.error( "Could not parse query. Please make sure it is well-formed."); diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java b/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java index 8294dca2..94bf15dc 100644 --- a/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java +++ b/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java @@ -73,7 +73,9 @@ public class QuerySerializer { private List errors; private List warnings; private List messages; - + + private boolean DEBUG = false; + public QuerySerializer () { this.errors = new ArrayList<>(); this.warnings = new ArrayList<>(); @@ -102,6 +104,8 @@ public static void main (String[] args) { int i = 0; String[] queries = null; String ql = "poliqarpplus"; + boolean bDebug = true; + if (args.length < 2) { System.err .println("Usage: QuerySerializer \"query\" queryLanguage"); @@ -114,7 +118,9 @@ public static void main (String[] args) { for (String q : queries) { i++; try { - jg.run(q, ql); + if( bDebug ) System.out.printf("QuerySerialize: query = >>%s<< lang = %s.\n", q, ql); + + jg.run(q, ql); System.out.println(); } catch (NullPointerException npe) { @@ -140,6 +146,9 @@ public static void main (String[] args) { * @throws IOException */ public void run (String query, String queryLanguage) throws IOException { + + ast.verbose = DEBUG ? true : false; // debugging: 01.09.23/FB + if (queryLanguage.equalsIgnoreCase("poliqarp")) { ast = new PoliqarpPlusQueryProcessor(query); } diff --git a/src/main/java/de/ids_mannheim/korap/util/StringUtils.java b/src/main/java/de/ids_mannheim/korap/util/StringUtils.java new file mode 100644 index 00000000..29410d18 --- /dev/null +++ b/src/main/java/de/ids_mannheim/korap/util/StringUtils.java @@ -0,0 +1,157 @@ +package de.ids_mannheim.korap.util; + +/* general String manipulation functions moved + * from de.ids_mannheim.de.korap.query.parse.cosmas.c2ps_opREG.java and Cosmas2QueryProcessor.java. + * 24.10.23/FB + */ + +public final class StringUtils { + + private static final boolean DEBUG = false; + + /** + * replaceIfNotEscaped: + * - kind of adhoc alternative to String.replaceAll(). + * - replaces every occurence of >>"<< in buf IF it isn't escaped by >>\<<. + * Notes: + * - first intention: replace String.replaceALL() in processOPREG() because + * replaceALL() cannot be used in that special case. + * Returns the replaced string. + * 25.09.23/FB + */ + + public static String replaceIfNotEscaped(String buf) + + { + StringBuffer + sb = new StringBuffer(buf); + + for(int i=0; i>"<< for #REG(expr) + * instead of String.replaceAll(). + * - replaces every occurence of >>"<< in buf that is not escaped by >>\<<. + * - If the >>"<< is escaped, the escape char is removed: >>\"<< -> >>"<<. + * Notes: + * - the converted string is intented to be greped. + * E.g.: + * - >>"\"Abend\"-Ticket"<< -> >>"Abend"-Ticket<<. + * Returns the replaced string. + * 26.09.23/FB + */ + + public static String replaceDoubleQuotes(String buf) + + { + StringBuffer + sb = new StringBuffer(buf); + + if( DEBUG ) System.out.printf("replaceDoubleQuotes: input: >>%s<<.\n", buf); + + for(int i=0; i>\"<< -> >>"<<. + sb.deleteCharAt(i); + else if( sb.codePointAt(i+1) == '\\' ) // >>\\<< unchanged. + i++; // keep >>\\<< unchanged. + } + } + else if( sb.codePointAt(i) == '"' ) + { + sb.deleteCharAt(i); // unescaped >>"<< is removed. + i--; + } + } + + if( DEBUG ) System.out.printf("replaceDoubleQuotes: output: >>%s<<.\n", sb.toString()); + + return sb.toString(); + + } // replaceDoubleQuotes + + /* encode2DoubleQuoted: + * transforms an unquoted string into an double quoted string + * and escapes >>"<< and >>/<<. + * E.g. >>.."..<< -> >>"..\".."<<. + * E.g. >>..\..<< -> >>"..\\.."<<. + * E.g. >>..\"..<< -> >>"..\\\".."<<, etc. + * + * escaping >>"<< and >>\<<, because they will be + * enclosed in >>"..."<<. + * >>"<< -> >>\"<< + * >>\<< -> >>\\<< + * + * 28.09.23/FB + * + * E.g. from previous, olddated version: + * \\" -> \\\" + * \\\" -> \\\" + */ + + public static void encode2DoubleQuoted(StringBuffer sb) + + { + if( DEBUG ) System.out.printf("encode2DoubleQuoted: input = >>%s<<.\n", sb.toString()); + + for(int i=0; i>%s<<.\n", sb.toString()); + } // encode2DoubleQuoted + + /* + * removeBlanksAtBothSides + * 28.09.23/FB + */ + + public static void removeBlanksAtBothSides(StringBuffer sb) + + { + int len; + + // remove leading blanks: >> abc << -> >>abc <<: + while( sb.length() > 0 && sb.charAt(0) == ' ') + sb.deleteCharAt(0); + + // remove trailing blanks: >>abc << -> >>abc<<: + while( (len=sb.length()) > 0 && sb.charAt(len-1) == ' ' ) + sb.deleteCharAt(len-1); + + } // removeBlanksAtBothSides + +} diff --git a/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java index 0722c9b8..759810f9 100644 --- a/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java +++ b/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java @@ -14,12 +14,15 @@ import static org.junit.Assert.*; +import static de.ids_mannheim.korap.query.parse.cosmas.c2ps_opREG.*; +import de.ids_mannheim.korap.util.StringUtils; /** * Tests for JSON-LD serialization of Cosmas II queries. * * @author Joachim Bingel (bingel@ids-mannheim.de) * @author Nils Diewald - * @version 1.1 + * @author Franck Bodmer + * @version 1.2 - 21.09.23 */ public class Cosmas2QueryProcessorTest { @@ -1702,4 +1705,224 @@ public void testMultipleParenthesis () throws JsonProcessingException, IOExcepti assertEquals("s", res.at("/query/distances/0/key").asText()); assertEquals("operation:sequence", res.at("/query/operation").asText()); } + + /* Testing #REG(expr), #REG('expr') and #REG("expr"). + * 21.09.23/FB + */ + + @Test + public void testREG () throws JsonProcessingException, IOException { + + boolean debug = false; + + query = "#REG(^aber$)"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("koral:token", res.at("/query/@type").asText()); + assertEquals("koral:term", res.at("/query/wrap/@type").asText()); + assertEquals("^aber$", res.at("/query/wrap/key").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("match:eq", res.at("/query/wrap/match").asText()); + + query = "#REG('été\\'')"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("été'" , res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('été\' )"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("été" , res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('été\\')"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("été\\", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(l'été)"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("l'été", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(l\\'été)"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("l'été", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(\"l'été\")"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("l'été", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(\"l\\'été\")"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("l'été", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('l\\'été.*')"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("l'été.*", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('\\\"été\\\"$')"; // means user input is #REG('\"été\"'). + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("\"été\"$", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + // checks the >>"<<: + query = "#REG(\\\"Abend\\\"-Ticket)"; // means user input = #REG(\"Abend\"-Ticket). + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('\\\"Abend\\\"-Ticket')"; // means user input = #REG(\"Abend\"-Ticket). + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG('\"Abend\"-Ticket')"; // means user input = #REG('"Abend"-Ticket'). + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText()); // key must be escaped, because converted to in "...". + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(\"\\\"Abend\\\"-Ticket\")"; // means user input = #REG("\"Abend\"-Ticket") -> key: >>"Abend"-Ticket<<. + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText()); + assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + // + + query = "#REG('^(a|b)?+*$')"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + assertEquals("^(a|b)?+*$", res.at("/query/wrap/key").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + + query = "#REG(\"[A-Z()]\")"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + assertEquals("[A-Z()]", res.at("/query/wrap/key").asText()); + assertEquals("orth", res.at("/query/wrap/layer").asText()); + assertEquals("type:regex", res.at("/query/wrap/type").asText()); + + query = "#REG(^klein.*) /s0 #REG(A.*ung)"; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + //System.out.printf("Debug: res: pretty: %s.\n", res.toPrettyString()); + + assertEquals("^klein.*", res.at("/query/operands/0/operands/0/wrap/key").asText()); + assertEquals("orth", res.at("/query/operands/0/operands/0/wrap/layer").asText()); + assertEquals("type:regex", res.at("/query/operands/0/operands/0/wrap/type").asText()); + + assertEquals("A.*ung", res.at("/query/operands/1/operands/0/wrap/key").asText()); + assertEquals("orth", res.at("/query/operands/1/operands/0/wrap/layer").asText()); + assertEquals("type:regex", res.at("/query/operands/1/operands/0/wrap/type").asText()); + + query = "#REG( ) "; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + assertTrue(res.toString().contains("Failing to parse")); + + query = "#REG('' ) "; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + assertTrue(res.toString().contains("Failing to parse")); + + query = "#REG(\"\") "; + qs.setQuery(query, "cosmas2"); + res = mapper.readTree(qs.toJSON()); + + assertTrue(res.toString().contains("Failing to parse")); + + } + + @Test + public void testREGencode2DoubleQuoted () { + StringBuffer sb = new StringBuffer("..\".."); + StringUtils.encode2DoubleQuoted(sb); + assertEquals("\"..\\\"..\"",sb.toString()); + + sb = new StringBuffer("..\\.."); + StringUtils.encode2DoubleQuoted(sb); + assertEquals("\"..\\\\..\"", sb.toString()); + + sb = new StringBuffer("..\".."); + StringUtils.encode2DoubleQuoted(sb); + assertEquals("\"..\\\"..\"", sb.toString()); + } + + @Test + public void testREGremoveBlanksAtBothSides () { + StringBuffer sb = new StringBuffer(" aabc cjs ss "); + StringUtils.removeBlanksAtBothSides(sb); + assertEquals("aabc cjs ss",sb.toString()); + + sb = new StringBuffer("abc "); + StringUtils.removeBlanksAtBothSides(sb); + assertEquals("abc",sb.toString()); + + sb = new StringBuffer(" abc"); + StringUtils.removeBlanksAtBothSides(sb); + assertEquals("abc",sb.toString()); + } }