Issue #66: REG: missing #REG-Operator implemented: rebased.

Added new tests. Moved general purpose methods to StringUtils.java. Change-Id: I42f12251a73511fff07b48e06f6018ba1e181433 Reviewed-on: https://korap.ids-mannheim.de/gerrit/c/KorAP/Koral/+/7658 Reviewed-by: Nils Diewald <nils@diewald-online.de>
KorAP · Oct 25, 2023 · 5551a7e · 5551a7e
1 parent 203bc7d
commit 5551a7e
Show file tree

Hide file tree

Showing 8 changed files with 795 additions and 31 deletions.
diff --git a/pom.xml b/pom.xml
@@ -177,6 +177,7 @@
 		          	<exclude>**/c2ps_opIN.java</exclude>
 		          	<exclude>**/c2ps_opOV.java</exclude>
 		          	<exclude>**/c2ps_opPROX.java</exclude>
+		          	<exclude>**/c2ps_opREG.java</exclude>
 		          	<exclude>**/c2ps_opWF.java</exclude>
 		          	<exclude>**/c2ps_optCase.java</exclude>
 		          	<exclude>**/.gitignore</exclude>

diff --git a/src/main/antlr/cosmas/c2ps.g b/src/main/antlr/cosmas/c2ps.g
@@ -1,16 +1,20 @@
- // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-//												//
-// 	COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)			//
-// 	globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).				//
-//	17.12.12/FB										//
-//      v-0.6											//
-// TODO:											//
-// - se1: Einsetzen des Default-Operators in den kumulierten AST.				//
+// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+//												
+// 	COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)	
+// 	globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).			
+//	17.12.12/FB										
+//      v-0.6										
+// TODO:											
+// - se1: Einsetzen des Default-Operators in den kumulierten AST.		
+//
+//  v0.7 - 25.07.23/FB
+//    - added: #REG(x)
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 
 grammar c2ps;
 
 options { output=AST; backtrack=true; k=5;}
+// tokens that will appear as node names in the resulting AST:
 tokens  {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
 	 ARG1; ARG2; 
 	 OPWF; OPLEM; OPANNOT;
@@ -21,6 +25,7 @@ tokens  {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
 	 OPNOT;
 	 OPEXPR1;
 	 OPMORPH; OPELEM;
+	 OPREG;
 	}
 
 @header {package de.ids_mannheim.korap.query.parse.cosmas;}
@@ -76,6 +81,14 @@ OP_IN	:	'#IN' | '#IN(' OP_IN_OPTS? ')' ;
 
 OP_OV	:	'#OV' | '#OV(' OP_OV_OPTS? ')' ;
 
+// #REG(abc['"]) or #REG('abc\'s') or #REG("abc\"s"):
+
+OP_REG	: '#REG(' ' '* '\'' ('\\\''|~'\'')+  '\'' (' ')* ')'	
+			| 
+		  '#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')'
+		  	|
+		  '#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')';
+
 // EAVEXPR wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'.
 fragment OP_IN_OPTS
 	:	EAVEXPR ;
@@ -241,7 +254,7 @@ opNOT	:	('nicht' | 'NICHT' | 'not' | 'NOT') -> ^(OPNOT);
 // OP1: Suchoperatoren mit 1 Argument:
 // -----------------------------------
 
-op1	:	opBEG | opEND | opNHIT | opALL | opBED; 
+op1	:	opBEG | opEND | opNHIT | opALL | opBED | opREG; 
 
 // #BED(searchExpr, B).
 // B muss nachträglich in einer lokalen Grammatik überprüft werden.
@@ -259,3 +272,6 @@ opEND	:	( '#END(' | '#RECHTS(' ) searchExpr ')'  -> ^(OPEND searchExpr) ;
 opNHIT	:	( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ;
 
 opALL	:	( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')'  -> ^(OPALL searchExpr) ;
+
+opREG	:	OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ;
+
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
@@ -17,7 +17,8 @@ public static Tree check (String input, int index) {
         c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null;
 
         /*
-        System.out.println("check opBED: " + index + ": " + input);
+        System.out.format("opBED: check: input='%s', index=%d.\n", input, index); 
+        System.out.format("opBED: tokens ='%s'.\n", tokens.toString());
         System.out.flush();
         */
 
@@ -68,7 +69,7 @@ public static Tree checkTPos (String input, int index) {
 
 
     public static void main (String args[]) throws Exception {
-        String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" };
+        String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"};
         Tree tree;
 
         for (int i = 0; i < input.length; i++) {

diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
@@ -0,0 +1,235 @@
+package de.ids_mannheim.korap.query.parse.cosmas;
+
+import org.antlr.runtime.*;
+import org.antlr.runtime.tree.*;
+
+import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
+import de.ids_mannheim.korap.util.StringUtils;
+
+/*
+ * 1. transforms and encodes a regular COSMAS II like expression #REG(regexpr)
+ *    into a AST tree -> encode().
+ * 2. transforms tree into the corresponding Koral:token/Koral:term, like:
+ *    e.g. #REG(abc[']?s) ->
+ *     {
+ *      "@type": "koral:term",
+ *      "match": "match:eq",
+ *      "type" : "type:regex",
+ *      "key"  : "abc[']?s",
+ *      "layer": "orth"
+ *     }...
+ *
+ * - see doc: http://korap.github.io/Koral/
+ * - generation of koral:term -> processOPREG().
+ * 06.09.23/FB
+ */
+
+public class c2ps_opREG
+
+{
+	private static boolean DEBUG = false;
+
+	/* 
+	 * encode():
+	 * 
+	 * input = e.g. "#REG('abc(d|e)*')" -> return AST = (OPREG abc(d|e)*):
+	 * 
+	 * Returned String: no enclosing "..." needed, so no escaping of " nor \ needed.
+	 * 06.09.23/FB
+	 */
+	public static Tree encode (String input, int tokenType) 
+
+	{
+    if( DEBUG )
+    	{
+    	System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType); 
+    	System.out.flush();
+    	}
+
+    if( input.substring(0, 5).compareToIgnoreCase("#REG(") != 0 || input.charAt(input.length()-1) != ')' )
+    	{
+    	// error: '#REG(' and ')' not found: return input unchanged.
+        if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input);
+    	return new CommonTree(new CommonToken(tokenType, input));
+    	}
+
+
+    StringBuffer sb = new StringBuffer(input.substring(5));
+    sb.deleteCharAt(sb.length()-1);
+
+	// #REG("a"), #REG(a), #REG('a') -> >>a<<.
+    // enclosing ".." are appended at the end of this function.
+    // a. remove blanks around ".." and '..',
+    //    e.g. a. #REG( ' abc ' ) -> #REG(' abc ').
+
+    StringUtils.removeBlanksAtBothSides(sb);
+
+	if( sb.charAt(0) == '\'' || sb.charAt(0) == '"')
+		{
+		// remove pairwise at both ends.
+		sb.deleteCharAt(0);
+		if( sb.charAt(sb.length()-1) == '\'' || sb.charAt(sb.length()-1) == '"' )
+			sb.deleteCharAt(sb.length()-1);
+		}
+
+	// b. remove blanks inside '..' or "..",
+    //    E.g. #REG(' abc ') -> #REG('abc'):
+
+	StringUtils.removeBlanksAtBothSides(sb);
+
+	/* unescape >>'<<, >>"<< and >>\<<.
+	 * e.g. #REG('that\'s') -> "that\'s" -> >>that's<<.
+	 */
+
+	for(int i=0; i<sb.length()-1; i++)
+		{
+		if( sb.charAt(i) == '\\' && 
+			(sb.charAt(i+1) == '\'' || sb.charAt(i+1) == '"' || sb.charAt(i+1) == '\\' ))
+			sb.deleteCharAt(i);
+		}
+
+	/* old version:
+	for(int i=0; i<sb.length()-1; i++)
+		{
+		if( sb.charAt(i) == '\\' && sb.charAt(i+1) == '\'' )
+			sb.deleteCharAt(i);
+		}
+	*/
+
+	/* old version:
+	 * encode2DoubleQuoted(sb);
+	 */
+
+	if( DEBUG ) 
+    	System.out.printf("opREG.encode: encoded = >>%s<<.\n", sb.toString());
+
+    return new CommonTree(new CommonToken(tokenType, sb.toString()));
+
+	} // encode
+
+	/*
+	 * printTokens:
+	 * - debugging aid: runs the lexer over query and prints the number of
+	 *   tokens followed by the text of each token to stdout.
+	 * - works on its own lexer and CommonTokenStream, because
+	 *   fill() consumes all tokens of the stream it is called on.
+	 * - lexer errors are reported to the given errorListener.
+	 * 14.09.23/FB
+	 */
+
+	private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener)
+
+		{
+		c2psLexer lexer = new c2psLexer(new ANTLRStringStream(query));
+		org.antlr.runtime.CommonTokenStream tokenStream =
+				new org.antlr.runtime.CommonTokenStream(lexer); // v3
+
+		lexer.setErrorReporter(errorListener);
+
+		// pull every token out of the lexer:
+		tokenStream.fill();
+
+		System.out.printf("opREG.check: no. of tokens = %d.\n",  tokenStream.size()); 
+
+		int pos = 0;
+		while( pos < tokenStream.size() )
+			{
+			System.out.printf("opREG.check: token[%2d] = %s.\n",  pos, tokenStream.get(pos).getText()); 
+			pos++;
+			}
+
+		} // printTokens
+
+		/**
+		 * Parses a complete COSMAS II query with the global c2ps grammar and
+		 * returns the resulting AST.
+		 *
+		 * Lexer and parser both report errors to an
+		 * Antlr3DescriptiveErrorListener built for the query. When DEBUG is
+		 * set, the token list is printed first via printTokens(), which lexes
+		 * the query separately and leaves the stream used here untouched.
+		 *
+		 * @param query the query text, e.g. {@code #REG('abc')}.
+		 * @param index not used by this method.
+		 * @return the AST of the query, or null if the parser threw a
+		 *         RecognitionException.
+		 */
+
+	   public static Tree check (String query, int index) 
+
+	   {
+	        ANTLRStringStream 
+	        	ss = new ANTLRStringStream(query);
+	        c2psLexer 
+	        	lex = new c2psLexer(ss);
+	        org.antlr.runtime.CommonTokenStream 
+	        	tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3
+	        c2psParser 
+	        	g = new c2psParser(tokens);
+	        Tree 
+	        	tree = null;
+	        Antlr3DescriptiveErrorListener errorListener =
+	                new Antlr3DescriptiveErrorListener(query);
+
+	        // Use custom error reporters for lexer and parser; per the original
+	        // author's note the program otherwise breaks on broken input,
+	        // e.g. >>#REG(\" a"s\")<<.
+	        lex.setErrorReporter(errorListener);
+	        g.setErrorReporter(errorListener);
+
+	        if( DEBUG )
+		       {
+		        printTokens(query, errorListener);
+		        System.out.flush();
+		       }
+
+	        try {
+	            // c2ps_query is the start rule of the global grammar.
+	            c2psParser.c2ps_query_return 
+	            	c2Return = g.c2ps_query();
+
+	            tree = (Tree) c2Return.getTree();
+	        	}
+	        catch (RecognitionException e) {
+	            // report which class failed and why; tree stays null.
+	            System.err.printf("c2ps_opREG.check: Recognition Exception: %s!\n", e);
+	        	}
+
+	     return tree;
+	    } // check
+
+
+	/** 
+	 * main
+	 */
+
+    public static void main (String args[]) throws Exception 
+
+    {
+    	String input[] = {	"#REG(abc)", 
+    						"#REG(def's)", 
+    						"#REG(  def's  )", 		// all blanks should be removed.
+    						"#REG( ' def\\'s ' )", 	// same
+    						"#REG( \" def's \" )", // same
+    						"#REG(abc[\"]ef)", 
+    						"#REG('abc')", 			// ' fehlt: generates Syntax Error .
+    						"#REG('abc\')",			// User input = #REG('abc\') : OK, nothing escaped.
+     						"#REG('abc\'')",			// User input = #REG('abc\') : OK, nothing escaped.
+     					   	"#REG('abc\\')",		// User input = #REG('abc\') : OK, same behavior: \\ == \.
+    						"#REG((a|b))",			// broken input, should use ".." or '..'.
+    						"#REG('(a|b)')",		// OK.
+    						"#REG(\"(a|b)\")",		// OK.
+    						"#REG(^[A-Z]+abc[\']*ung$)",
+    						"#REG('ab(cd|ef)*')", 
+    						"#REG('abc(def|g)*[)(]')",
+    						"#REG(\"abc(def|g)*[)(]\")",
+							"#REG('abc[\"]')",		// User input = #REG('abc["]') : OK, needs escape => #REG("...\"...")
+							"#REG(\"abc[\\\"]\")",	// User input = #REG("abc["]") : broken because of 2nd " -> syntax error.
+							"#REG(\"abc[\\\"]\")",	// User input = #REG("abc[\"]"): OK, already escaped by user => #REG("...\"...")
+							"#REG(\"abc[\\\\\"]\")"	// User input = #REG("abc[\\"]") : broken. with escaped "    => #REG("...\"...")
+							};
+    	Tree tree;
+
+    	for (int i = 0; i < input.length; i++) 
+        	{
+            System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]);
+            tree = check(input[i], 0);
+            System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i, tree.toStringTree());
+            }
+
+
+    } // main
+
+}