Data index benchmarks (#304)
stanbrub authored Jun 13, 2024
1 parent df70671 commit 06ad64d
Showing 4 changed files with 317 additions and 27 deletions.
@@ -22,8 +22,9 @@
*/
final public class StandardTestRunner {
final Object testInst;
final List<String> supportTables = new ArrayList<>();
final List<String> setupQueries = new ArrayList<>();
final List<String> preOpQueries = new ArrayList<>();
private String mainTable = "source";
private Bench api;
private Controller controller;
@@ -59,34 +60,57 @@ public void tables(String... names) {
mainTable = names[0];

for (String name : names) {
generateTable(name, null, null);
}
}

/**
* Generate a pre-defined table and set an explicit distribution for that table's data. This will override the
* <code>default.data.distribution</code> property. The given table name will be used as the main table for
* subsequent queries.
*
* @param name the table name to generate
* @param distribution the name of the distribution (random | runlength | ascending | descending)
*/
public void table(String name, String distribution) {
mainTable = name;
generateTable(name, distribution, null);
}

/**
* Generate a pre-defined table and set a column grouping for the resulting table. The given table name will be
* used as the main table for subsequent queries.
*
* @param name the table name to generate
* @param groups the column names to use for the table's column grouping
*/
public void groupedTable(String name, String... groups) {
mainTable = name;
generateTable(name, null, groups);
}

/**
* Add a query to be run directly after the main table is loaded; it is not measured. This query can transform the
* main table or supporting tables, set up aggregations or updateby operations, etc.
*
* @param query the query to run before the benchmark operation
*/
public void addSetupQuery(String query) {
setupQueries.add(query);
}

/**
* Add a query to be run directly before the measured operation. This query allows changes to tables or
* configuration that must occur after the setup queries run but before the operation itself. When in doubt, use
* <code>addSetupQuery</code>.
*
* @param query the query to run just before the measured operation
*/
public void addPreOpQuery(String query) {
preOpQueries.add(query);
}

/**
* The {@code scale.row.count} property supplies a default for the number of rows generated for benchmark tests.
* Given that some operations use less memory than others, scaling up the generated rows per operation is more
@@ -198,8 +222,10 @@ Result runStaticTest(String name, String operation, String read, String... loadC
garbage_collect()
${preOpQueries}
bench_api_metrics_snapshot()
print('${logOperationBegin}')
begin_time = time.perf_counter_ns()
result = ${operation}
end_time = time.perf_counter_ns()
@@ -228,6 +254,7 @@ Result runIncTest(String name, String operation, String read, String... loadColu
garbage_collect()
${preOpQueries}
bench_api_metrics_snapshot()
print('${logOperationBegin}')
begin_time = time.perf_counter_ns()
@@ -260,6 +287,7 @@ Result runTest(String name, String query, String operation, String read, String.
query = query.replace("${loadSupportTables}", loadSupportTables());
query = query.replace("${loadColumns}", listStr(loadColumns));
query = query.replace("${setupQueries}", String.join("\n", setupQueries));
query = query.replace("${preOpQueries}", String.join("\n", preOpQueries));
query = query.replace("${operation}", operation);
query = query.replace("${logOperationBegin}", getLogSnippet("Begin", name));
query = query.replace("${logOperationEnd}", getLogSnippet("End", name));
@@ -340,8 +368,8 @@ void restartDocker() {
api.metrics().add(metrics);
}

void generateTable(String name, String distribution, String[] groups) {
var isNew = generateNamedTable(name, distribution, groups);
if (isNew) {
if (!api.isClosed()) {
api.setName("# Data Table Generation " + name);
@@ -350,20 +378,20 @@ void generateTable(String name, String distribution) {
}
initialize(testInst);
// This should not be necessary. Why does DH need it?
generateNamedTable(name, distribution, groups);
}
}

boolean generateNamedTable(String name, String distribution, String[] groups) {
return switch (name) {
case "source" -> generateSourceTable(distribution);
case "right" -> generateRightTable(distribution);
case "timed" -> generateTimedTable(distribution);
case "source" -> generateSourceTable(distribution, groups);
case "right" -> generateRightTable(distribution, groups);
case "timed" -> generateTimedTable(distribution, groups);
default -> throw new RuntimeException("Undefined table name: " + name);
};
}

boolean generateSourceTable(String distribution, String[] groups) {
return api.table("source")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
@@ -373,10 +401,11 @@ boolean generateSourceTable(String distribution) {
.add("key4", "int", "[0-98]", distribution)
.add("key5", "string", "[1-1000000]", distribution)
.withRowCount(getGeneratedRowCount())
.withColumnGrouping(groups)
.generateParquet();
}

boolean generateRightTable(String distribution, String[] groups) {
if (distribution == null && api().property("default.data.distribution", "").equals("descending")) {
distribution = "descending";
} else {
@@ -387,12 +416,14 @@ boolean generateRightTable(String distribution) {
.add("r_key1", "string", "[1-100]", distribution)
.add("r_key2", "string", "[1-101]", distribution)
.add("r_wild", "string", "[1-10000]", distribution)
.add("r_key4", "int", "[0-98]", distribution)
.add("r_key5", "string", "[1-1010000]", distribution)
.withRowCount(1010000)
.withColumnGrouping(groups)
.generateParquet();
}

boolean generateTimedTable(String distribution, String[] groups) {
long minTime = 1676557157537L;
long maxTime = minTime + getGeneratedRowCount() - 1;
return api.table("timed")
@@ -404,6 +435,7 @@ boolean generateTimedTable(String distribution) {
.add("key3", "int", "[0-8]", distribution)
.add("key4", "int", "[0-98]", distribution)
.withFixedRowCount(true)
.withColumnGrouping(groups)
.generateParquet();
}

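The new hooks compose naturally: generate a grouped table, add unmeasured setup, then add a pre-op query that builds the data index just before the measured operation. A minimal sketch (hypothetical test class and illustrative query strings; it uses only the runner methods shown above):

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

public class GroupedSourceExampleTest {
    final StandardTestRunner runner = new StandardTestRunner(this);

    @Test
    void avgByGroupedAndIndexed() {
        runner.setRowFactor(1);
        // Generate "source" with column grouping metadata on the key columns
        runner.groupedTable("source", "key1", "key2");
        // Runs once after the table loads; never measured
        runner.addSetupQuery("from deephaven.experimental.data_index import data_index");
        // Runs just before the measured operation; also outside the measurement
        var preOp = """
            source_idx = data_index(source, ['key1','key2'])
            source_idx.table
            """;
        runner.addPreOpQuery(preOp);
        runner.test("Example-AvgBy Grouped Indexed", "source.avg_by(by=['key1','key2'])", "num1", "key1", "key2");
    }
}

The new test class below applies this same pattern to compare indexed and non-indexed variants of the same operations.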
@@ -0,0 +1,179 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.standard.index;

import org.junit.jupiter.api.*;
import org.junit.jupiter.api.MethodOrderer.OrderAnnotation;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
* Standard tests for using the <code>data_index</code> and assessing its performance benefit to operations where it
* applies. Faster benchmarks like WhereIn are meaningful only in comparison to the non-indexed version of the
* benchmark, since benchmarks that run in a fraction of a second may not be meaningful for night-to-night comparison.
* The cost of loading the data index prevents scaling these faster benchmarks to meaningful operational run times.
*/
@TestMethodOrder(OrderAnnotation.class)
public class DataIndexBenefitTest {
final StandardTestRunner runner = new StandardTestRunner(this);

void setup(int rowFactor, int staticFactor, int incFactor) {
runner.setRowFactor(rowFactor);
runner.tables("source", "right");
runner.setScaleFactors(staticFactor, incFactor);

var setup = """
from deephaven.experimental.data_index import data_index
QueryTable = jpy.get_type('io.deephaven.engine.table.impl.QueryTable')
QueryTable.setMemoizeResults(False)
filter_table = empty_table(1000).update([
'set1=``+(ii % 10)', 'set2=``+(ii % 11)', 'set3=(int)(ii % 8)',
])
""";
runner.addSetupQuery(setup);
}

@Test
@Order(1)
void dataIndexWhereInNoIndex() {
setup(1, 250, 225);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn No Index 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(2)
void dataIndexWhereInIndexedStatic() {
setup(1, 8, 0);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
filter_table_idx = data_index(filter_table, ['set1','set2','set3'])
filter_table_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(3)
void dataIndexWhereInIndexedInc() {
setup(1, 0, 2);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
filter_table_idx = data_index(filter_table, ['set1','set2','set3'])
filter_table_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-WhereIn Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(4)
void dataIndexAvgByNoIndex() {
setup(1, 6, 0);

var op = """
source.avg_by(by=['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AvgBy No Index 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(5)
void dataIndexAvgByIndexed() {
setup(1, 8, 0);

var preOp = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
""";
runner.addPreOpQuery(preOp);

var op = """
source.avg_by(by=['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AvgBy Indexed 1M Unique Combos", 999900, op, "num1", "key1", "key2", "key4");
}

@Test
@Order(6)
void dataIndexSortNoIndex() {
setup(1, 1, 0);

var op = """
source.sort(['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-Sort No Index 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(7)
void dataIndexSortIndexed() {
setup(1, 8, 0);

var setup = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
""";
runner.addSetupQuery(setup);

var op = """
source.sort(['key1','key2','key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-Sort Indexed 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(8)
void dataIndexAsOfJoinNoIndex() {
setup(1, 4, 0);

var op = """
source.aj(right, on=['key1 = r_wild', 'key2 = r_key2', 'key4 >= r_key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AsOfJoin No Index 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

@Test
@Order(9)
void dataIndexAsOfJoinIndexed() {
setup(1, 4, 0);

var setup = """
source_idx = data_index(source, ['key1','key2','key4'])
source_idx.table
right_idx = data_index(right, ['r_wild','r_key2','r_key4'])
right_idx.table
""";
runner.addSetupQuery(setup);

var op = """
source.aj(right, on=['key1 = r_wild', 'key2 = r_key2', 'key4 >= r_key4'])
QueryTable.setMemoizeResults(True)
""";
runner.test("DataIndex-AsOfJoin Indexed 1M Unique Combos", op, "num1", "key1", "key2", "key4");
}

}
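Note the pairing discipline in the tests above: the indexed and non-indexed variants run the same operation string, and only the unmeasured pre-op (or setup) index creation differs, so timing deltas isolate the benefit of the index. A condensed sketch of that pairing (hypothetical helper; it reuses the runner and query strings from the class above):

// Hypothetical helper: the measured operation is identical in both variants;
// only the unmeasured pre-op index creation differs.
void runWhereIn(boolean indexed) {
    if (indexed) {
        runner.addPreOpQuery("""
            source_idx = data_index(source, ['key1','key2','key4'])
            source_idx.table
            """);
    }
    var op = """
        source.where_in(filter_table, cols=['key1 = set1', 'key2 = set2', 'key4 = set3'])
        QueryTable.setMemoizeResults(True)
        """;
    runner.test("DataIndex-WhereIn " + (indexed ? "Indexed" : "No Index") + " 1M Unique Combos",
        999900, op, "num1", "key1", "key2", "key4");
}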