From b3586efc956824ed10a64d00db5e3582179f9b30 Mon Sep 17 00:00:00 2001 From: Chirag Singh <137233133+chirag-s-db@users.noreply.github.com> Date: Mon, 18 Mar 2024 13:31:06 -0700 Subject: [PATCH] Add support for `like()` function (#130) * add like support * Update README.md * Update SplToCatalystTest.scala --- README.md | 2 +- .../spl/catalyst/SplToCatalyst.scala | 24 ++++++++++++++ .../spl/catalyst/SplToCatalystTest.scala | 33 +++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c532ea6..c2e44d4d 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ There's basic support for the most used commands like `addtotals`, `bin`, `colle `streamstats`, `table`, `where`. There's also basic support for functions like `auto()`, `cidr_match()`, `coalesce()`, `count()`, -`ctime()`, `earliest()`, `if()`, `isnotnull()`, `latest()`, `len()`, `lower()`, `max()`, +`ctime()`, `earliest()`, `if()`, `isnotnull()`, `latest()`, `len()`, `like()`, `lower()`, `max()`, `memk()`, `min()`, `mvappend()`, `mvcount()`, `mvfilter()`, `mvindex()`, `none()`, `null()`, `num()`, `replace()`, `rmcomma()`, `rmunit()`, `round()`, `strftime()`, `substr()`, `sum()`, `term()`, `values()`. 
diff --git a/src/main/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalyst.scala b/src/main/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalyst.scala
index 4126111e..0aec26e2 100644
--- a/src/main/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalyst.scala
+++ b/src/main/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalyst.scala
@@ -173,6 +173,33 @@ object SplToCatalyst extends Logging {
       determineMax(ctx, call)
     case "len" =>
       Length(attrOrExpr(ctx, call.args.head))
+    case "like" =>
+      val field = attrOrExpr(ctx, call.args.head)
+      val pattern = attrOrExpr(ctx, call.args(1))
+      pattern match {
+        case Literal(patternLiteral: UTF8String, StringType) =>
+          val patternString = patternLiteral.toString
+          // A pattern of the form %foo% can be rewritten as a CONTAINS expression, but only
+          // when the text between the two outer % wildcards is matched literally. That text
+          // must be non-empty and free of every LIKE metacharacter: % (any sequence of
+          // characters), _ (any single character) and \ (the escape character). Rejecting \
+          // also rejects a trailing escaped wildcard such as %foo\%. This is conservative:
+          // a pattern like %foo\%bar% could be converted after unescaping, but it falls back
+          // to LIKE instead.
+          val isSimpleContains = patternString.length > 2 &&
+            patternString.startsWith("%") &&
+            patternString.endsWith("%") &&
+            !patternString
+              .substring(1, patternString.length - 1)
+              .exists(c => c == '%' || c == '_' || c == '\\')
+          if (isSimpleContains) {
+            Contains(field,
+              Literal(patternString.substring(1, patternString.length - 1)))
+          } else {
+            Like(field, pattern, '\\')
+          }
+        case _ => Like(field, pattern, '\\')
+      }
     case "substr" =>
       val str = attrOrExpr(ctx, call.args.head)
       val pos = expression(ctx, call.args(1))
diff --git a/src/test/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalystTest.scala b/src/test/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalystTest.scala
index aba73a2d..d3277268 100644
--- a/src/test/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalystTest.scala
+++ b/src/test/scala/com/databricks/labs/transpiler/spl/catalyst/SplToCatalystTest.scala
@@ -1096,6 +1096,60 @@ class SplToCatalystTest extends AnyFunSuite with PlanTestBase {
     )
   }
 
+  test("simple LIKE converted to CONTAINS") {
+    check(ast.SearchCommand(
+      ast.Call("like", Seq(ast.Field("a"), ast.StrValue("%foo%")))),
+      (_, tree) =>
+        Filter(
+          Contains(
+            UnresolvedAttribute("a"),
+            Literal.create("foo")),
+          tree)
+    )
+  }
+
+  test("complex LIKE not converted to CONTAINS") {
+    check(ast.SearchCommand(
+      ast.Call("like", Seq(ast.Field("a"), ast.StrValue("%foo%bar%")))),
+      (_, tree) =>
+        Filter(
+          Like(
+            UnresolvedAttribute("a"),
+            Literal.create("%foo%bar%"), '\\'),
+          tree)
+    )
+    check(ast.SearchCommand(
+      ast.Call("like", Seq(ast.Field("a"), ast.StrValue("%foo\\%")))),
+      (_, tree) =>
+        Filter(
+          Like(
+            UnresolvedAttribute("a"),
+            Literal.create("%foo\\%"), '\\'),
+          tree)
+    )
+  }
+
+  test("LIKE with single-character wildcard or escape not converted to CONTAINS") {
+    check(ast.SearchCommand(
+      ast.Call("like", Seq(ast.Field("a"), ast.StrValue("%foo_bar%")))),
+      (_, tree) =>
+        Filter(
+          Like(
+            UnresolvedAttribute("a"),
+            Literal.create("%foo_bar%"), '\\'),
+          tree)
+    )
+    check(ast.SearchCommand(
+      ast.Call("like", Seq(ast.Field("a"), ast.StrValue("%foo\\_bar%")))),
+      (_, tree) =>
+        Filter(
+          Like(
+            UnresolvedAttribute("a"),
+            Literal.create("%foo\\_bar%"), '\\'),
+          tree)
+    )
+  }
+
   test("eventstats max(colA) AS maxA by colC") {
     check(ast.EventStatsCommand(
       allNum = false,