diff --git a/pom.xml b/pom.xml
index 29c4fcce..9e784ab0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -177,6 +177,7 @@
**/c2ps_opIN.java
**/c2ps_opOV.java
**/c2ps_opPROX.java
+ **/c2ps_opREG.java
**/c2ps_opWF.java
**/c2ps_optCase.java
**/.gitignore
diff --git a/src/main/antlr/cosmas/c2ps.g b/src/main/antlr/cosmas/c2ps.g
index c264ea63..8908a494 100644
--- a/src/main/antlr/cosmas/c2ps.g
+++ b/src/main/antlr/cosmas/c2ps.g
@@ -1,16 +1,20 @@
- // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-// //
-// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) //
-// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). //
-// 17.12.12/FB //
-// v-0.6 //
-// TODO: //
-// - se1: Einsetzen des Default-Operators in den kumulierten AST. //
+// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+//
+// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)
+// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).
+// 17.12.12/FB
+// v-0.6
+// TODO:
+// - se1: Einsetzen des Default-Operators in den kumulierten AST.
+//
+// v0.7 - 25.07.23/FB
+// - added: #REG(x)
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
grammar c2ps;
options { output=AST; backtrack=true; k=5;}
+// tokens that will appear as node names in the resulting AST:
tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
ARG1; ARG2;
OPWF; OPLEM; OPANNOT;
@@ -21,6 +25,7 @@ tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
OPNOT;
OPEXPR1;
OPMORPH; OPELEM;
+ OPREG;
}
@header {package de.ids_mannheim.korap.query.parse.cosmas;}
@@ -76,6 +81,14 @@ OP_IN : '#IN' | '#IN(' OP_IN_OPTS? ')' ;
OP_OV : '#OV' | '#OV(' OP_OV_OPTS? ')' ;
+// OP_REG: lexer token covering the whole regex operator, from '#REG(' up to and
+// including the closing ')'. Blanks directly after '#REG(' are accepted in all forms.
+// Three alternatives, tried in this order:
+// 1. single-quoted, e.g. #REG('abc\'s'): at least one character between the quotes,
+// \' is accepted inside them, blanks may follow the closing quote.
+// 2. double-quoted, e.g. #REG("abc\"s"): same as 1., with \" accepted inside the quotes.
+// 3. unquoted, e.g. #REG(abc['"]): the first character is neither a quote nor a blank;
+// everything up to the first ')' belongs to the token, so an unquoted expression
+// cannot contain ')'.
+
+OP_REG : '#REG(' ' '* '\'' ('\\\''|~'\'')+ '\'' (' ')* ')'
+ |
+ '#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')'
+ |
+ '#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')';
+
// EAVEXP wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'.
fragment OP_IN_OPTS
: EAVEXPR ;
@@ -241,7 +254,7 @@ opNOT : ('nicht' | 'NICHT' | 'not' | 'NOT') -> ^(OPNOT);
// OP1: Suchoperatoren mit 1 Argument:
// -----------------------------------
-op1 : opBEG | opEND | opNHIT | opALL | opBED;
+op1 : opBEG | opEND | opNHIT | opALL | opBED | opREG;
// #BED(serchExpr, B).
// B muss nachträglich in einer lokalen Grammatik überprüft werden.
@@ -259,3 +272,6 @@ opEND : ( '#END(' | '#RECHTS(' ) searchExpr ')' -> ^(OPEND searchExpr) ;
opNHIT : ( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ;
opALL : ( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')' -> ^(OPALL searchExpr) ;
+
+// opREG: turns a single OP_REG token into an OPREG node. The node's child is the tree
+// returned by c2ps_opREG.encode() for the token text; OPREG is passed as its token type.
+opREG : OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ;
+
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
index fb9df4e8..35f64379 100644
--- a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
@@ -17,7 +17,8 @@ public static Tree check (String input, int index) {
c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null;
/*
- System.out.println("check opBED: " + index + ": " + input);
+ System.out.format("opBED: check: input='%s', index=%d.\n", input, index);
+ System.out.format("opBED: tokens ='%s'.\n", tokens.toString());
System.out.flush();
*/
@@ -68,7 +69,7 @@ public static Tree checkTPos (String input, int index) {
public static void main (String args[]) throws Exception {
- String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" };
+ String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"};
Tree tree;
for (int i = 0; i < input.length; i++) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
new file mode 100644
index 00000000..a798647a
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
@@ -0,0 +1,235 @@
+package de.ids_mannheim.korap.query.parse.cosmas;
+
+import org.antlr.runtime.*;
+import org.antlr.runtime.tree.*;
+
+import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
+import de.ids_mannheim.korap.util.StringUtils;
+
+/*
+ * 1. transforms and encodes a COSMAS II regular expression operator #REG(regexpr)
+ * into an AST -> encode().
+ * 2. transforms tree into the corresponding Koral:token/Koral:term, like:
+ * e.g. #REG(abc[']?s) ->
+ * {
+ * "@type": "koral:term",
+ * "match": "match:eq",
+ * "type" : "type:regex",
+ * "key" : "abc[']?s",
+ * "layer": "orth"
+ * }...
+ *
+ * - see doc: http://korap.github.io/Koral/
+ * - generation of koral:term -> Cosmas2QueryProcessor.processOPREG().
+ * 06.09.23/FB
+ */
+
+public class c2ps_opREG
+
+{
+ private static boolean DEBUG = false;
+
+ /*
+ * encode():
+ *
+ * input = e.g. "#REG('abc(d|e)*')" -> return AST = (OPREG abc(d|e)*):
+ *
+ * Returned String: no enclosing "..." needed, so no escaping of " nor \ needed.
+ * 06.09.23/FB
+ */
+ public static Tree encode (String input, int tokenType)
+
+ {
+ if( DEBUG )
+ {
+ System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType);
+ System.out.flush();
+ }
+
+ if( input.substring(0, 5).compareToIgnoreCase("#REG(") != 0 || input.charAt(input.length()-1) != ')' )
+ {
+ // error: '#REG(' and ')' not found: return input unchanged.
+ if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input);
+ return new CommonTree(new CommonToken(tokenType, input));
+ }
+
+
+ StringBuffer sb = new StringBuffer(input.substring(5));
+ sb.deleteCharAt(sb.length()-1);
+
+ // #REG("a"), #REG(a), #REG('a') -> >>a<<.
+ // enclosing ".." are appended at the end of this function.
+ // a. remove blanks around ".." and '..',
+ // e.g. a. #REG( ' abc ' ) -> #REG(' abc ').
+
+ StringUtils.removeBlanksAtBothSides(sb);
+
+ if( sb.charAt(0) == '\'' || sb.charAt(0) == '"')
+ {
+ // remove pairwise at both ends.
+ sb.deleteCharAt(0);
+ if( sb.charAt(sb.length()-1) == '\'' || sb.charAt(sb.length()-1) == '"' )
+ sb.deleteCharAt(sb.length()-1);
+ }
+
+ // b. remove blanks inside '..' or "..",
+ // E.g. #REG(' abc ') -> #REG('abc'):
+
+ StringUtils.removeBlanksAtBothSides(sb);
+
+ /* unescape >>'<<, >>"<< and >>\<<.
+ * e.g. #REG('that\'s') -> "that\'s" -> >>that's<<.
+ */
+
+ for(int i=0; i>%s<<.\n", sb.toString());
+
+ return new CommonTree(new CommonToken(tokenType, sb.toString()));
+
+ } // encode
+
+ /*
+ * printTokens:
+ * Notes:
+ * - must build a separate CommonTokenStream here, because
+ * tokens.fill() will consume all tokens.
+ * - prints to stdout list of tokens from lexer.
+ * - mainly for debugging.
+ * 14.09.23/FB
+ *
+ */
+
+ private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener)
+
+ {
+ ANTLRStringStream
+ ss = new ANTLRStringStream(query);
+ c2psLexer
+ lex = new c2psLexer(ss);
+ org.antlr.runtime.CommonTokenStream
+ tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3
+
+ lex.setErrorReporter(errorListener);
+
+ // get all tokens from lexer:
+ tokens.fill();
+
+ System.out.printf("opREG.check: no. of tokens = %d.\n", tokens.size());
+ for(int i=0; i>#REG(\" a"s\")<<.
+ lex.setErrorReporter(errorListener);
+ ((c2psParser) g).setErrorReporter(errorListener);
+
+ if( DEBUG )
+ {
+ //System.out.format("opREG.check: input='%s', index=%d.\n", query, index);
+ printTokens(query, errorListener);
+ System.out.flush();
+ }
+
+
+ try {
+ c2psParser.c2ps_query_return
+ c2Return = ((c2psParser) g).c2ps_query(); // statt t().
+
+ // AST Tree anzeigen:
+ tree = (Tree) c2Return.getTree();
+ //if (DEBUG)
+ // System.out.printf("opREG.check: tree = '%s'.\n", tree.toStringTree());
+ }
+ catch (RecognitionException e) {
+ System.err.printf("c2po_opREG.check: Recognition Exception!\n");
+ }
+
+ return tree;
+ } // check
+
+
+    /**
+     * Debugging driver: runs check() on a fixed list of #REG(...) queries and prints the
+     * resulting AST of each one to stdout.
+     *
+     * The comments next to the inputs show the query as the parser receives it, i.e. after
+     * Java string-literal unescaping: in a Java literal \' is a plain ' and \\ is a single \.
+     *
+     * @param args not used.
+     * @throws Exception declared so that anything thrown while parsing ends the run.
+     */
+
+    public static void main (String args[]) throws Exception
+
+    {
+        String input[] = { "#REG(abc)",
+                           "#REG(def's)",
+                           "#REG( def's )", // all blanks should be removed.
+                           "#REG( ' def\\'s ' )", // same; parser sees #REG( ' def\'s ' ).
+                           "#REG( \" def's \" )", // same
+                           "#REG(abc[\"]ef)",
+                           // NOTE(review): the original remark here said a quote is missing and a syntax
+                           // error results, but this literal is well-formed; #REG('abc) may have been intended.
+                           "#REG('abc')", // parser sees #REG('abc').
+                           "#REG('abc\')", // parser sees #REG('abc') as well: \' in a Java literal is just '.
+                           "#REG('abc\'')", // parser sees #REG('abc'').
+                           "#REG('abc\\')", // parser sees #REG('abc\).
+                           "#REG((a|b))", // broken input, should use ".." or '..'.
+                           "#REG('(a|b)')", // OK.
+                           "#REG(\"(a|b)\")", // OK.
+                           "#REG(^[A-Z]+abc[\']*ung$)", // parser sees #REG(^[A-Z]+abc[']*ung$).
+                           "#REG('ab(cd|ef)*')",
+                           "#REG('abc(def|g)*[)(]')",
+                           "#REG(\"abc(def|g)*[)(]\")",
+                           "#REG('abc[\"]')", // User input = #REG('abc["]') : OK, needs escape => #REG("...\"...")
+                           // Was byte-identical to the next entry; now really contains the unescaped inner ".
+                           "#REG(\"abc[\"]\")", // User input = #REG("abc["]") : broken because of 2nd " -> syntax error.
+                           "#REG(\"abc[\\\"]\")", // User input = #REG("abc[\"]"): OK, already escaped by user => #REG("...\"...")
+                           "#REG(\"abc[\\\\\"]\")" // User input = #REG("abc[\\"]") : broken. with escaped " => #REG("...\"...")
+                         };
+        Tree tree;
+
+        for (int i = 0; i < input.length; i++)
+        {
+            System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]);
+            tree = check(input[i], 0);
+
+            // check() returns its tree variable even when parsing failed (its catch branch
+            // assigns nothing), so there may be no tree to print - presumably null then.
+            if( tree == null )
+            {
+                System.out.printf("c2ps_opREG: tree %02d: no tree returned.\n\n", i);
+                continue;
+            }
+
+            System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i, tree.toStringTree());
+        }
+
+    } // main
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
index 69a6293a..8bbfa351 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
@@ -15,6 +15,7 @@
import de.ids_mannheim.korap.query.serialize.util.KoralObjectGenerator;
import de.ids_mannheim.korap.query.serialize.util.ResourceMapper;
import de.ids_mannheim.korap.query.serialize.util.StatusCodes;
+import de.ids_mannheim.korap.util.StringUtils;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.RecognitionException;
@@ -127,7 +128,6 @@ public class Cosmas2QueryProcessor extends Antlr3AbstractQueryProcessor {
public static Pattern wildcardPlusPattern = Pattern.compile("([+])");
public static Pattern wildcardQuestionPattern = Pattern.compile("([?])");
-
/**
* @param tree
* The syntax tree as returned by ANTLR
@@ -142,6 +142,7 @@ public Cosmas2QueryProcessor (String query) {
process(query);
if (DEBUG) {
log.debug(">>> " + requestMap.get("query") + " <<<");
+ System.out.printf("Cosmas2QueryProcessor: >>%s<<.\n", requestMap.get("query"));
}
}
@@ -151,14 +152,19 @@ public void process (String query) {
Tree tree = null;
tree = parseCosmasQuery(query);
if (DEBUG) {
+ System.out.printf("\nProcessing COSMAS II query: %s.\n\n", query);
log.debug("Processing CosmasII query: " + query);
}
- if (tree != null) {
- if (DEBUG) {
- log.debug("ANTLR parse tree: " + tree.toStringTree());
- }
+ if (tree != null)
+ {
+
+ if (DEBUG) {
+ log.debug("ANTLR parse tree: " + tree.toStringTree());
+ System.out.printf("\nANTLR parse tree: %s.\n\n", tree.toStringTree());
+ }
+
processNode(tree);
- }
+ }
}
@@ -278,6 +284,11 @@ private void processNode (Tree node) {
if (nodeCat.equals("OPBED")) {
processOPBED(node);
}
+
+ if (nodeCat.equals("OPREG")) {
+ processOPREG(node);
+ }
+
objectsToPop.push(stackedObjects);
toWrapsToPop.push(stackedToWrap);
@@ -444,6 +455,88 @@ else if (conditionCount < conditionGroups.size()) {
}
}
+    /* processOPREG:
+     *
+     * Builds the Koral token for a #REG(...) operator and hands it to putIntoSuperObject().
+     *
+     * - input node structure is: (OPREG regexpr); the text of the first child is used
+     *   unchanged as the regular expression.
+     * - creates a token whose "wrap" entry is a term, like:
+     *   e.g. #REG(abc[']?s) ->
+     *     {
+     *       "@type": "koral:term",
+     *       "match": "match:eq", // optional
+     *       "type" : "type:regex",
+     *       "key"  : "abc[']?s",
+     *       "layer": "orth"
+     *     }.
+     * - a node without children is ignored: nothing is pushed or stored.
+     *
+     * - see doc: http://korap.github.io/Koral/
+     *
+     * 06.09.23/FB
+     */
+
+    private void processOPREG (Tree node)
+
+    {
+        // empty case (is that possible?): checked before anything touches the child,
+        // because the debug output below dereferences it.
+        if( node.getChildCount() < 1 )
+            return;
+
+        Tree
+            nodeChild = node.getChild(0);
+
+        if( DEBUG )
+        {
+            System.out.printf("Debug: processOPREG: child: >>%s<< cat=%s type=%d.\n",
+                    nodeChild.getText(), getNodeCat(node), nodeChild.getType());
+        }
+
+        // see processOPWF_OPWF_OPLEM
+        // for how to insert regexpr into Koral JSON-LD
+
+        Map
+            token = KoralObjectGenerator.makeToken();
+
+        objectStack.push(token);
+        stackedObjects++;
+
+        Map
+            fieldMap = KoralObjectGenerator.makeTerm();
+
+        token.put("wrap", fieldMap);
+
+        // The child text is taken as it is. Earlier versions deleted every occurrence of >>"<<
+        // here; that is no longer done, as escaped sequences must be kept for the
+        // argument of #REG().
+        String
+            value = nodeChild.toStringTree();
+
+        // make category-specific fieldMap entries:
+        fieldMap.put("key", value);
+        fieldMap.put("layer", "orth");
+        fieldMap.put("type", "type:regex");
+        fieldMap.put("match", "match:eq");
+
+        // decide where to put (objPos=1, not clear why, but it works only like that - 20.09.23/FB):
+        putIntoSuperObject(token,1);
+
+    } // processOPREG
+
private void processOPNHIT (Tree node) {
Integer[] classRef = new Integer[] { classCounter + 128 + 1,
@@ -1511,19 +1604,40 @@ private Map wrap (Map[] wrapCascade) {
@SuppressWarnings("unchecked")
- private void putIntoSuperObject (Map object,
- int objStackPosition) {
- if (objectStack.size() > objStackPosition) {
+ private void putIntoSuperObject (Map object, int objStackPosition)
+
+ {
+ if( DEBUG )
+ {
+ System.out.printf("Debug: putIntosuperObject(<>,int): objectStack.size=%d objStackPos=%d object=%s.\n",
+ objectStack.size(), objStackPosition, object == null ? "null" : "not null");
+
+ if( objectStack != null && objectStack.size() > 0 )
+ System.out.printf("Debug: putIntosuperObject: objectStack = %s.\n", objectStack.toString());
+
+ if( invertedOperandsLists != null )
+ System.out.printf("Debug: putIntosuperObject: invertedOperandsLists: [%s].\n", invertedOperandsLists.toString());
+ }
+
+
+ if (objectStack.size() > objStackPosition)
+ {
ArrayList