Data index benchmarks (#304)
stanbrub authored Jun 13, 2024
1 parent df70671 commit 06ad64d
Showing 4 changed files with 317 additions and 27 deletions.
@@ -22,8 +22,9 @@
*/
final public class StandardTestRunner {
final Object testInst;
final List<String> supportTables = new ArrayList<>();
final List<String> setupQueries = new ArrayList<>();
final List<String> preOpQueries = new ArrayList<>();
private String mainTable = "source";
private Bench api;
private Controller controller;
@@ -59,34 +60,57 @@ public void tables(String... names) {
mainTable = names[0];

for (String name : names) {
generateTable(name, null, null);
}
}

/**
* Generate a pre-defined table and set an explicit distribution for that table's data. This will override the
* <code>default.data.distribution</code> property. The given table name will be used as the main table for
* subsequent queries.
*
* @param name the table name to generate
* @param distribution the name of the distribution (random | runlength | ascending | descending)
*/
public void table(String name, String distribution) {
mainTable = name;
generateTable(name, distribution, null);
}

/**
* Generate a pre-defined table and set a column grouping for the resulting table. The given table name will be
* used as the main table for subsequent queries.
*
* @param name the table name to generate
* @param groups the column names to use for the table's column grouping
*/
public void groupedTable(String name, String... groups) {
mainTable = name;
generateTable(name, null, groups);
}

/**
* Add a query to be run directly after the main table is loaded; it is not measured. This query can transform the
* main table or supporting tables, set up aggregations or updateby operations, etc.
*
* @param query the query to run before the benchmark operation
*/
public void addSetupQuery(String query) {
setupQueries.add(query);
}

/**
* Add a query to be run directly before the measured operation. This query allows changes to tables or
* configuration that must occur after the setup queries run but before the operation itself. When in doubt, use
* <code>addSetupQuery</code>.
*
* @param query the query to run just before the measured operation
*/
public void addPreOpQuery(String query) {
preOpQueries.add(query);
}

/**
* The {@code scale.row.count} property supplies a default for the number of rows generated for benchmark tests.
* Given that some operations use less memory than others, scaling up the generated rows per operation is more
@@ -198,8 +222,10 @@ Result runStaticTest(String name, String operation, String read, String... loadC
garbage_collect()
${preOpQueries}
bench_api_metrics_snapshot()
print('${logOperationBegin}')
begin_time = time.perf_counter_ns()
result = ${operation}
end_time = time.perf_counter_ns()
@@ -228,6 +254,7 @@ Result runIncTest(String name, String operation, String read, String... loadColu
garbage_collect()
${preOpQueries}
bench_api_metrics_snapshot()
print('${logOperationBegin}')
begin_time = time.perf_counter_ns()
@@ -260,6 +287,7 @@ Result runTest(String name, String query, String operation, String read, String.
query = query.replace("${loadSupportTables}", loadSupportTables());
query = query.replace("${loadColumns}", listStr(loadColumns));
query = query.replace("${setupQueries}", String.join("\n", setupQueries));
query = query.replace("${preOpQueries}", String.join("\n", preOpQueries));
query = query.replace("${operation}", operation);
query = query.replace("${logOperationBegin}", getLogSnippet("Begin", name));
query = query.replace("${logOperationEnd}", getLogSnippet("End", name));
@@ -340,8 +368,8 @@ void restartDocker() {
api.metrics().add(metrics);
}

void generateTable(String name, String distribution, String[] groups) {
var isNew = generateNamedTable(name, distribution, groups);
if (isNew) {
if (!api.isClosed()) {
api.setName("# Data Table Generation " + name);
@@ -350,20 +378,20 @@ void generateTable(String name, String distribution) {
}
initialize(testInst);
// This should not be necessary. Why does DH need it?
generateNamedTable(name, distribution, groups);
}
}

boolean generateNamedTable(String name, String distribution, String[] groups) {
return switch (name) {
case "source" -> generateSourceTable(distribution);
case "right" -> generateRightTable(distribution);
case "timed" -> generateTimedTable(distribution);
case "source" -> generateSourceTable(distribution, groups);
case "right" -> generateRightTable(distribution, groups);
case "timed" -> generateTimedTable(distribution, groups);
default -> throw new RuntimeException("Undefined table name: " + name);
};
}

boolean generateSourceTable(String distribution, String[] groups) {
return api.table("source")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
@@ -373,10 +401,11 @@ boolean generateSourceTable(String distribution) {
.add("key4", "int", "[0-98]", distribution)
.add("key5", "string", "[1-1000000]", distribution)
.withRowCount(getGeneratedRowCount())
.withColumnGrouping(groups)
.generateParquet();
}

boolean generateRightTable(String distribution, String[] groups) {
if (distribution == null && api().property("default.data.distribution", "").equals("descending")) {
distribution = "descending";
} else {
@@ -387,12 +416,14 @@ boolean generateRightTable(String distribution) {
.add("r_key1", "string", "[1-100]", distribution)
.add("r_key2", "string", "[1-101]", distribution)
.add("r_wild", "string", "[1-10000]", distribution)
.add("r_key4", "int", "[0-98]", distribution)
.add("r_key5", "string", "[1-1010000]", distribution)
.withRowCount(1010000)
.withColumnGrouping(groups)
.generateParquet();
}

boolean generateTimedTable(String distribution, String[] groups) {
long minTime = 1676557157537L;
long maxTime = minTime + getGeneratedRowCount() - 1;
return api.table("timed")
@@ -404,6 +435,7 @@ boolean generateTimedTable(String distribution) {
.add("key3", "int", "[0-8]", distribution)
.add("key4", "int", "[0-98]", distribution)
.withFixedRowCount(true)
.withColumnGrouping(groups)
.generateParquet();
}

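The new hooks compose naturally: generate a grouped table, add unmeasured setup, then add a pre-op query that builds the data index just before the measured operation. A minimal sketch (hypothetical test class and illustrative query strings; it uses only the runner methods shown above):

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

public class GroupedSourceExampleTest {
    final StandardTestRunner runner = new StandardTestRunner(this);

    @Test
    void avgByGroupedAndIndexed() {
        runner.setRowFactor(1);
        // Generate "source" with column grouping metadata on the key columns
        runner.groupedTable("source", "key1", "key2");
        // Runs once after the table loads; never measured
        runner.addSetupQuery("from deephaven.experimental.data_index import data_index");
        // Runs just before the measured operation; also outside the measurement
        var preOp = """
            source_idx = data_index(source, ['key1','key2'])
            source_idx.table
            """;
        runner.addPreOpQuery(preOp);
        runner.test("Example-AvgBy Grouped Indexed", "source.avg_by(by=['key1','key2'])", "num1", "key1", "key2");
    }
}

The new test class below applies this same pattern to compare indexed and non-indexed variants of the same operations.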
@@ -0,0 +1,179 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.standard.index;

import org.junit.jupiter.api.*;
import org.junit.jupiter.api.MethodOrderer.OrderAnnotation;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
* Standard tests for using the <code>data_index</code> and assessing its performance benefit to operations where it
* applies. Faster benchmarks like WhereIn are meaningful only in comparison to the non-indexed version of the
* benchmark, since benchmarks that run in a fraction of a second may not be meaningful for night-to-night comparison.
* The cost of loading the data index prevents scaling these faster benchmarks to meaningful operational run times.
*/
@TestMethodOrder(OrderAnnotation.class)
public class DataIndexBenefitTest {
final StandardTestRunner runner = new StandardTestRunner(this);

void setup(int rowFactor, int staticFactor, int incFactor) {
runner.setRowFactor(rowFactor);
runner.tables("source", "right");
runner.setScaleFactors(staticFactor, incFactor);

var setup = """
from deephaven.experimental.data_index import data_index
QueryTable = jpy.get_type('io.deephaven.engine.table.impl.QueryTable')
QueryTable.setMemoizeResults(False)
filter_table = empty_table(1000).update([
'set1=``+(ii % 10)', 'set2=``+(ii % 11)', 'set3=(int)(ii % 8)',
])
""";
runner.addSetupQuery(setup);
}

@Test
@Order(1)
void dataIndexWhereInNoIndex() {
setup(1, 250, 225);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn No Index 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(2)
void dataIndexWhereInIndexedStatic() {
setup(1, 8, 0);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
filter_table_idx = data_index(filter_table, ['set1','set2','set3'])
filter_table_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(3)
void dataIndexWhereInIndexedInc() {
setup(1, 0, 2);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
filter_table_idx = data_index(filter_table, ['set1','set2','set3'])
filter_table_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(4)
void dataIndexAvgByNoIndex() {
setup(1, 6, 0);

var op = """
source.avg_by(by=['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AvgBy No Index 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(5)
void dataIndexAvgByIndexed() {
setup(1, 8, 0);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.avg_by(by=['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AvgBy Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(6)
void dataIndexSortNoIndex() {
setup(1, 1, 0);

var op = """
source.sort(['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-Sort No Index 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(7)
void dataIndexSortIndexed() {
setup(1, 8, 0);

var setup = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
""";
runner.addSetupQuery(setup);

var op = """
source.sort(['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-Sort Indexed 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(8)
void dataIndexAsOfJoinNoIndex() {
setup(1, 4, 0);

var op = """
source.aj(right, on=['key1 = r_wild', 'key2 = r_key2', 'key4 >= r_key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AsOfJoin No Index 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(9)
void dataIndexAsOfJoinIndexed() {
setup(1, 4, 0);

var setup = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
right_idx = data_index(right, ['r_wild','r_key2','r_key4'])
right_idx.table
""";
runner.addSetupQuery(setup);

var op = """
source.aj(right, on=['key1 = r_wild', 'key2 = r_key2', 'key4 >= r_key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AsOfJoin Indexed 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

}
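Note the pairing discipline in the tests above: the indexed and non-indexed variants run the same operation string, and only the unmeasured pre-op (or setup) index creation differs, so timing deltas isolate the benefit of the index. A condensed sketch of that pairing (hypothetical helper; it reuses the runner and query strings from the class above):

// Hypothetical helper: the measured operation is identical in both variants;
// only the unmeasured pre-op index creation differs.
void runWhereIn(boolean indexed) {
    if (indexed) {
        runner.addPreOpQuery("""
            source_idx = data_index(source, ['key1','key2','key4'])
            source_idx.table
            """);
    }
    var op = """
        source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
        QueryTable.setMemoizeResults(True)
        """;
    runner.test("DataIndex-WhereIn " + (indexed ? "Indexed" : "No Index") + " 1M Unique Combos",
        999900, op, "num1", "key1", "key2", "key4");
}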