Skip to content

Commit

Permalink
Issue #66: REG: missing #REG-Operator implemented: rebased.
Browse files Browse the repository at this point in the history
Added new tests.
Moved general purpose methods to StringUtils.java.

Change-Id: I42f12251a73511fff07b48e06f6018ba1e181433
Reviewed-on: https://korap.ids-mannheim.de/gerrit/c/KorAP/Koral/+/7658
Reviewed-by: Nils Diewald <nils@diewald-online.de>
  • Loading branch information
Bodmo committed Oct 25, 2023
1 parent 203bc7d commit 5551a7e
Show file tree
Hide file tree
Showing 8 changed files with 795 additions and 31 deletions.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@
<exclude>**/c2ps_opIN.java</exclude>
<exclude>**/c2ps_opOV.java</exclude>
<exclude>**/c2ps_opPROX.java</exclude>
<exclude>**/c2ps_opREG.java</exclude>
<exclude>**/c2ps_opWF.java</exclude>
<exclude>**/c2ps_optCase.java</exclude>
<exclude>**/.gitignore</exclude>
Expand Down
34 changes: 25 additions & 9 deletions src/main/antlr/cosmas/c2ps.g
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
// //
// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) //
// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). //
// 17.12.12/FB //
// v-0.6 //
// TODO: //
// - se1: Einsetzen des Default-Operators in den kumulierten AST. //
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
//
// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)
// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).
// 17.12.12/FB
// v-0.6
// TODO:
// - se1: Einsetzen des Default-Operators in den kumulierten AST.
//
// v0.7 - 25.07.23/FB
// - added: #REG(x)
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

grammar c2ps;

options { output=AST; backtrack=true; k=5;}
// tokens that will appear as node names in the resulting AST:
tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
ARG1; ARG2;
OPWF; OPLEM; OPANNOT;
Expand All @@ -21,6 +25,7 @@ tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
OPNOT;
OPEXPR1;
OPMORPH; OPELEM;
OPREG;
}

@header {package de.ids_mannheim.korap.query.parse.cosmas;}
Expand Down Expand Up @@ -76,6 +81,14 @@ OP_IN : '#IN' | '#IN(' OP_IN_OPTS? ')' ;

OP_OV : '#OV' | '#OV(' OP_OV_OPTS? ')' ;

// #REG(abc['"]) or #REG('abc\'s') or #REG("abc\"s"):

// OP_REG: lexer token covering the whole '#REG(...)' operator, including its argument.
// Alternative 1: single-quoted argument; blanks are allowed around the quotes and
// an escaped quote (\') may appear inside the quoted text.
OP_REG : '#REG(' ' '* '\'' ('\\\''|~'\'')+ '\'' (' ')* ')'
|
// Alternative 2: double-quoted argument; an escaped quote (\") may appear inside.
'#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')'
|
// Alternative 3: unquoted argument; its first character is neither a quote nor a blank,
// and the argument runs up to the first ')', so it cannot contain a ')' itself.
'#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')';

// EAVEXPR wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'.
fragment OP_IN_OPTS
: EAVEXPR ;
Expand Down Expand Up @@ -241,7 +254,7 @@ opNOT : ('nicht' | 'NICHT' | 'not' | 'NOT') -> ^(OPNOT);
// OP1: Suchoperatoren mit 1 Argument:
// -----------------------------------

op1 : opBEG | opEND | opNHIT | opALL | opBED;
op1 : opBEG | opEND | opNHIT | opALL | opBED | opREG;

// #BED(searchExpr, B).
// B muss nachträglich in einer lokalen Grammatik überprüft werden.
Expand All @@ -259,3 +272,6 @@ opEND : ( '#END(' | '#RECHTS(' ) searchExpr ')' -> ^(OPEND searchExpr) ;
opNHIT : ( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ;

opALL : ( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')' -> ^(OPALL searchExpr) ;

// opREG: rewrites an OP_REG token into an OPREG node whose child is the tree returned by
// c2ps_opREG.encode(), called with the token text and the OPREG token type.
opREG : OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ;

Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ public static Tree check (String input, int index) {
c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null;

/*
System.out.println("check opBED: " + index + ": " + input);
System.out.format("opBED: check: input='%s', index=%d.\n", input, index);
System.out.format("opBED: tokens ='%s'.\n", tokens.toString());
System.out.flush();
*/

Expand Down Expand Up @@ -68,7 +69,7 @@ public static Tree checkTPos (String input, int index) {


public static void main (String args[]) throws Exception {
String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" };
String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"};
Tree tree;

for (int i = 0; i < input.length; i++) {
Expand Down
235 changes: 235 additions & 0 deletions src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
package de.ids_mannheim.korap.query.parse.cosmas;

import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;

import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
import de.ids_mannheim.korap.util.StringUtils;

/*
* 1. transforms and encodes a regular COSMAS II like expression #REG(regexpr)
* into a AST tree -> encode().
* 2. transforms tree into the corresponding Koral:token/Koral:term, like:
* e.g. #REG(abc[']?s) ->
* {
* "@type": "koral:term",
* "match": "match:eq",
* "type" : "type:regex",
* "key" : "abc[']?s",
* "layer": "orth"
* }...
*
* - see doc: http://korap.github.io/Koral/
* - generation of koral:term -> processOPREG().
* 06.09.23/FB
*/

public class c2ps_opREG

{
private static boolean DEBUG = false;

/*
* encode():
*
* input = e.g. "#REG('abc(d|e)*')" -> return AST = (OPREG abc(d|e)*):
*
* Returned String: no enclosing "..." needed, so no escaping of " nor \ needed.
* 06.09.23/FB
*/
/**
 * Extracts the regular expression from a COSMAS II {@code #REG(...)} operator and
 * wraps it in a single AST node.
 *
 * <p>Processing steps:
 * <ol>
 *   <li>strip the leading {@code #REG(} (case-insensitive) and the trailing {@code )};</li>
 *   <li>remove blanks around the argument;</li>
 *   <li>if the argument starts with {@code '} or {@code "}, remove that quote and the
 *       matching quote at the end, if present;</li>
 *   <li>remove blanks that were inside the quotes;</li>
 *   <li>unescape {@code \'}, {@code \"} and {@code \\} by dropping the backslash.</li>
 * </ol>
 *
 * <p>Input that is {@code null}, too short, or not of the form {@code #REG(...)} is
 * returned unchanged as the text of the node instead of raising an exception.
 *
 * @param input     the text of the operator token, e.g. {@code #REG('abc(d|e)*')}.
 * @param tokenType the token type to assign to the returned node.
 * @return a node of type {@code tokenType} whose text is the extracted expression,
 *         e.g. {@code abc(d|e)*}; the text is not enclosed in quotes.
 */
public static Tree encode (String input, int tokenType)

{
    final String prefix = "#REG(";

    if( DEBUG )
    {
        System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType);
        System.out.flush();
    }

    // regionMatches() is safe on input shorter than the prefix, unlike substring(0, 5).
    if( input == null
        || input.length() <= prefix.length()
        || !input.regionMatches(true, 0, prefix, 0, prefix.length())
        || input.charAt(input.length()-1) != ')' )
    {
        // error: '#REG(' and ')' not found: return input unchanged.
        if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input);
        return new CommonTree(new CommonToken(tokenType, input));
    }

    // Argument between '#REG(' and the closing ')'.
    StringBuffer sb = new StringBuffer(input.substring(prefix.length(), input.length()-1));

    // a. remove blanks around ".." and '..',
    //    e.g. #REG( ' abc ' ) -> #REG(' abc ').
    if( sb.length() > 0 )
        StringUtils.removeBlanksAtBothSides(sb);

    // Strip the enclosing quotes; the length checks protect against an empty
    // argument (#REG()) and against an argument consisting of a single quote.
    if( sb.length() > 0 && (sb.charAt(0) == '\'' || sb.charAt(0) == '"') )
    {
        char quote = sb.charAt(0);

        sb.deleteCharAt(0);
        // remove pairwise: only a closing quote of the same kind belongs to the opening one.
        if( sb.length() > 0 && sb.charAt(sb.length()-1) == quote )
            sb.deleteCharAt(sb.length()-1);
    }

    // b. remove blanks inside '..' or "..",
    //    e.g. #REG(' abc ') -> #REG('abc'):
    if( sb.length() > 0 )
        StringUtils.removeBlanksAtBothSides(sb);

    /* unescape >>'<<, >>"<< and >>\<<.
     * e.g. #REG('that\'s') -> >>that's<<.
     * After the backslash is deleted, position i holds the escaped character;
     * the loop increment then steps over it, so >>\\<< yields a single >>\<<.
     */
    for(int i=0; i<sb.length()-1; i++)
    {
        if( sb.charAt(i) == '\\' &&
            (sb.charAt(i+1) == '\'' || sb.charAt(i+1) == '"' || sb.charAt(i+1) == '\\' ))
            sb.deleteCharAt(i);
    }

    if( DEBUG )
        System.out.printf("opREG.encode: encoded = >>%s<<.\n", sb.toString());

    return new CommonTree(new CommonToken(tokenType, sb.toString()));

} // encode

/*
* printTokens:
* Notes:
* - must build a separate CommonTokenStream here, because
* tokens.fill() will consume all tokens.
* - prints to stdout list of tokens from lexer.
* - mainly for debugging.
* 14.09.23/FB
*
*/

/**
 * Debugging aid: lexes {@code query} with a lexer and token stream of its own and
 * prints the number of tokens and the text of each token to stdout.
 *
 * @param query         the query text to tokenize.
 * @param errorListener the error reporter installed on the lexer.
 */
private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener)

{
    c2psLexer lexer = new c2psLexer(new ANTLRStringStream(query));
    org.antlr.runtime.CommonTokenStream tokenStream =
            new org.antlr.runtime.CommonTokenStream(lexer); // v3

    lexer.setErrorReporter(errorListener);

    // Pull every token out of the lexer before printing.
    tokenStream.fill();

    final int count = tokenStream.size();
    System.out.printf("opREG.check: no. of tokens = %d.\n", count);

    int pos = 0;
    while( pos < count )
    {
        System.out.printf("opREG.check: token[%2d] = %s.\n", pos, tokenStream.get(pos).getText());
        pos++;
    }

} // printTokens

/* check:
* Notes:
* - must build a separate CommonTokenStream here, because
* tokens.fill() will consume all tokens.
*/

/**
 * Lexes and parses {@code query} with the global c2ps grammar (rule {@code c2ps_query})
 * and returns the resulting AST.
 *
 * <p>The same {@link Antlr3DescriptiveErrorListener} is installed on the lexer and the
 * parser. When {@code DEBUG} is set, the token list is printed first via
 * {@code printTokens()}, which builds its own token stream.
 *
 * @param query the complete query text, e.g. {@code #REG('abc')}.
 * @param index not used by this method.
 * @return the AST produced by the parser, or {@code null} if a
 *         {@link RecognitionException} was caught.
 */
public static Tree check (String query, int index)

{
    ANTLRStringStream ss = new ANTLRStringStream(query);
    c2psLexer lex = new c2psLexer(ss);
    org.antlr.runtime.CommonTokenStream tokens =
            new org.antlr.runtime.CommonTokenStream(lex); // v3
    c2psParser g = new c2psParser(tokens);
    Tree tree = null;
    Antlr3DescriptiveErrorListener errorListener =
            new Antlr3DescriptiveErrorListener(query);

    // Custom error reporters for lexer and parser; per the original author's note, the
    // program would otherwise break on broken input, e.g. >>#REG(\" a"s\")<<.
    lex.setErrorReporter(errorListener);
    g.setErrorReporter(errorListener);

    if( DEBUG )
    {
        printTokens(query, errorListener);
        System.out.flush();
    }

    try {
        c2psParser.c2ps_query_return c2Return = g.c2ps_query();

        // fetch the AST from the parser result:
        tree = (Tree) c2Return.getTree();
    }
    catch (RecognitionException e) {
        // Report which exception occurred instead of dropping it silently.
        System.err.printf("c2ps_opREG.check: Recognition Exception: %s\n", e);
    }

    return tree;
} // check


/**
* main
*/

/**
 * Manual test driver: parses a fixed list of {@code #REG(...)} queries with
 * {@code check()} and prints the resulting tree of each one to stdout.
 *
 * @param args not used.
 * @throws Exception declared for compatibility; not thrown explicitly here.
 */
public static void main (String args[]) throws Exception

{
    // Comments give the query as the parser receives it, i.e. after Java has
    // processed the escape sequences of the string literal.
    String input[] = { "#REG(abc)",
            "#REG(def's)",
            "#REG( def's )", // all blanks should be removed.
            "#REG( ' def\\'s ' )", // same
            "#REG( \" def's \" )", // same
            "#REG(abc[\"]ef)",
            "#REG('abc')", // NOTE(review): original note said a ' is missing (syntax error), but both quotes are present.
            "#REG('abc\')", // NOTE(review): Java reads \' as ', so this is #REG('abc'), not #REG('abc\') - confirm intent.
            "#REG('abc\'')", // NOTE(review): Java reads \' as ', so this is #REG('abc'') - confirm intent.
            "#REG('abc\\')", // User input = #REG('abc\').
            "#REG((a|b))", // broken input, should use ".." or '..'.
            "#REG('(a|b)')", // OK.
            "#REG(\"(a|b)\")", // OK.
            "#REG(^[A-Z]+abc[\']*ung$)", // User input = #REG(^[A-Z]+abc[']*ung$).
            "#REG('ab(cd|ef)*')",
            "#REG('abc(def|g)*[)(]')",
            "#REG(\"abc(def|g)*[)(]\")",
            "#REG('abc[\"]')", // User input = #REG('abc["]') : OK.
            "#REG(\"abc[\\\"]\")", // NOTE(review): identical to the next entry; original note described #REG("abc["]") (unescaped 2nd ") - confirm intent.
            "#REG(\"abc[\\\"]\")", // User input = #REG("abc[\"]"): OK, " already escaped by user.
            "#REG(\"abc[\\\\\"]\")" // User input = #REG("abc[\\"]") : broken.
            };
    Tree tree;

    for (int i = 0; i < input.length; i++)
    {
        System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]);
        tree = check(input[i], 0);
        // check() returns null when parsing failed; do not dereference it.
        System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i,
                tree != null ? tree.toStringTree() : "<no tree>");
    }


} // main

}
Loading

0 comments on commit 5551a7e

Please sign in to comment.