Skip to content

Commit

Permalink
ENH: Improve performance of TextAnalyzer.merge (both memory usage and speed)
Browse files Browse the repository at this point in the history
  • Loading branch information
tsegall committed Nov 24, 2024
1 parent 4c8df69 commit 4ea9a01
Show file tree
Hide file tree
Showing 17 changed files with 197 additions and 91 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

## Changes ##

### 15.10.0
- ENH: Improve performance of TextAnalyzer.merge (both memory usage and speed)
- INT: Bump gradle to 8.11.1, org.springframework.boot to 3.4.0

### 15.9.0
- ENH: Remove support for TextAnalyzer.Feature.LEGACY_JSON
- ENH: cli option --json changed to --format json (default), added --format faker (to output a faker specification)
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

tasks.register('examples') {
Expand Down
6 changes: 0 additions & 6 deletions codecov.yml

This file was deleted.

2 changes: 1 addition & 1 deletion examples/datetraining/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
2 changes: 1 addition & 1 deletion examples/mergesimple/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
2 changes: 1 addition & 1 deletion examples/minicli/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
2 changes: 1 addition & 1 deletion examples/modebulk/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
2 changes: 1 addition & 1 deletion examples/moderecord/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
2 changes: 1 addition & 1 deletion examples/modestreaming/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

repositories {
Expand Down
4 changes: 2 additions & 2 deletions examples/web/build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
plugins {
id 'org.springframework.boot' version '3.3.5'
id 'org.springframework.boot' version '3.4.0'
id 'io.spring.dependency-management' version '1.1.6'
id "com.github.ben-manes.versions" version '0.51.0'
id 'java'
Expand All @@ -8,7 +8,7 @@ plugins {
}

wrapper {
gradleVersion = '8.11'
gradleVersion = '8.11.1'
}

group = 'com.cobber.fta'
Expand Down
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.11.1-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
Expand Down
2 changes: 1 addition & 1 deletion settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ includeBuild 'examples/modestreaming'
dependencyResolutionManagement {
versionCatalogs {
libs {
version('fta', '15.9.0')
version('fta', '15.10.0')
version('jacoco', '0.8.12')

// https://mvnrepository.com/artifact/com.univocity/univocity-parsers
Expand Down
48 changes: 48 additions & 0 deletions types/src/main/java/com/cobber/fta/CacheLRU.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright 2017-2024 Tim Segall
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cobber.fta;

import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

/**
 * Thin wrapper around a Guava {@link Cache} with a bounded number of entries.
 *
 * <p>The underlying cache is configured with {@code maximumSize(capacity)} and an
 * expire-after-access period of 10 minutes; eviction behavior is whatever Guava
 * provides for that configuration.
 *
 * @param <K> the type of the keys
 * @param <V> the type of the cached values
 */
public class CacheLRU<K, V> {
    /** Entries not accessed for this many minutes become eligible for expiry. */
    private static final long EXPIRE_AFTER_ACCESS_MINUTES = 10;

    /** The Guava cache that all operations are forwarded to. */
    private final Cache<K, V> delegate;

    /**
     * Create a cache bounded to the supplied number of entries.
     *
     * @param capacity the maximum size handed to the underlying Guava cache
     */
    public CacheLRU(int capacity) {
        this.delegate = CacheBuilder.newBuilder()
                .maximumSize(capacity)
                .expireAfterAccess(EXPIRE_AFTER_ACCESS_MINUTES, TimeUnit.MINUTES)
                .build();
    }

    /**
     * Associate the value with the key in the cache.
     *
     * @param key the key to store the value under
     * @param value the value to cache
     */
    public void put(K key, V value) {
        delegate.put(key, value);
    }

    /**
     * Look up the value cached for the key.
     *
     * @param key the key to look up
     * @return the result of {@code getIfPresent} on the underlying cache
     *         (callers in this project treat {@code null} as a cache miss)
     */
    public V get(K key) {
        return delegate.getIfPresent(key);
    }

    /**
     * Discard any entry cached for the key.
     *
     * @param key the key whose entry should be dropped
     */
    public void invalidate(K key) {
        delegate.invalidate(key);
    }

    /**
     * Report the number of entries held.
     *
     * @return the size as reported by the underlying cache
     */
    public long size() {
        return delegate.size();
    }
}
9 changes: 8 additions & 1 deletion types/src/main/java/com/cobber/fta/token/TokenStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.List;
import java.util.Set;

import com.cobber.fta.CacheLRU;
import com.cobber.fta.core.InternalErrorException;
import com.cobber.fta.core.RegExpGenerator;
import com.cobber.fta.core.Utils;
Expand Down Expand Up @@ -66,6 +67,8 @@ public class TokenStream {
/* The number of occurrences of this 'Pattern'. */
private long occurrences;

private final static CacheLRU<String, Automaton> cache = new CacheLRU<>(10);

/** The TokenStream that represents any input that is too long. */
public final static TokenStream ANYSHAPE = new TokenStream(Utils.repeat('x', Token.MAX_LENGTH + 1), 1);

Expand Down Expand Up @@ -462,7 +465,11 @@ public long getOccurrences() {
* @return True if the TokenStream matches the supplied Regular Expression.
*/
public boolean matches(final String regExp) {
final Automaton automaton = new RegExp(RegExpGenerator.toAutomatonRE(regExp, false), RegExp.AUTOMATON).toAutomaton(new DatatypesAutomatonProvider());
Automaton automaton = cache.get(regExp);
if (automaton == null) {
automaton = new RegExp(RegExpGenerator.toAutomatonRE(regExp, false), RegExp.AUTOMATON).toAutomaton(new DatatypesAutomatonProvider());
cache.put(regExp, automaton);
}

return matches(automaton.getInitialState(), 0);
}
Expand Down
31 changes: 31 additions & 0 deletions types/src/test/java/com/cobber/fta/AllocationTracker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright 2017-2024 Tim Segall
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cobber.fta;

import java.lang.management.ManagementFactory;

import com.sun.management.ThreadMXBean;
public class AllocationTracker {
ThreadMXBean threadMxBean = (ThreadMXBean) ManagementFactory.getThreadMXBean();

public AllocationTracker() {
threadMxBean.setThreadAllocatedMemoryEnabled(true);
}

public long getAllocated() {
return threadMxBean.getCurrentThreadAllocatedBytes();
}
}
135 changes: 68 additions & 67 deletions types/src/test/java/com/cobber/fta/TestDistributions.java
Original file line number Diff line number Diff line change
Expand Up @@ -767,73 +767,74 @@ public void leadingPlus() throws IOException, FTAException {
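// FLAKY: trains on random Gaussian samples, so the statistical assertions below (mean, standard deviation, quantiles) can occasionally fail.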
@Test(groups = { TestGroups.ALL, TestGroups.DISTRIBUTION })
public void normalCurve() throws IOException, FTAException {
for (int iter = 0; iter < 100; iter++) {
final TextAnalyzer analysis = new TextAnalyzer("normalCurve");
analysis.setLocale(Locale.forLanguageTag("en-US"));
final SecureRandom random = new SecureRandom();
final int SIZE = 100000;

for (int i = 0; i < SIZE; i++)
analysis.train(String.valueOf(random.nextGaussian()*100));

// Test pre getResult()
String serialized = analysis.serialize();
final TextAnalyzer hydrated = TextAnalyzer.deserialize(serialized);
assertTrue(analysis.equals(hydrated));

// Test a hydrated object
serialized = hydrated.serialize();
final TextAnalyzer rehydrated = TextAnalyzer.deserialize(serialized);
assertEquals(serialized, rehydrated.serialize());

final TextAnalysisResult result = rehydrated.getResult();

// Test post getResult()
serialized = rehydrated.serialize();
assertEquals(serialized, TextAnalyzer.deserialize(serialized).serialize());

assertEquals(result.getSampleCount(), SIZE);
assertEquals(result.getOutlierCount(), 0);
assertEquals(result.getMatchCount(), SIZE);
assertEquals(result.getNullCount(), 0);
assertEquals(result.getConfidence(), 1.0);
assertEquals(result.getType(), FTAType.DOUBLE);
assertEquals(result.getTypeModifier(), "SIGNED");

assertEquals(result.getMean(), 0.0, 1.0);
assertEquals(result.getStandardDeviation(), 100, 1);

final String q0_0 = result.getValueAtQuantile(0);
final String q0_5 = result.getValueAtQuantile(.5);
final String q1_0 = result.getValueAtQuantile(1.0);

// Median should be seriously close to 0
assertEquals(Double.parseDouble(q0_5), 0.0, 1.2);

// 3.5 Standard Deviations should cover low and high points
assertTrue(Double.parseDouble(q0_0) < -350);
assertTrue(Double.parseDouble(q1_0) > 350);

// 101 because we want 0.0 and 1.0 plus everything in between
final double[] percentiles = new double[101];
double value = 0.0;
for (int i = 0; i < 100; i++) {
percentiles[i] = value;
value += .01;
}
// Make sure the last one is precisely 1.0
percentiles[100] = 1.0;

final String[] answers = result.getValuesAtQuantiles(percentiles);
assertEquals(answers[0], q0_0);
assertEquals(answers[50], q0_5);
assertEquals(answers[100], q1_0);

for (int i = 10; i < 50; i++) {
double low = Double.parseDouble(answers[i]);
double high = Double.parseDouble(answers[100 - i]);
// System.err.printf("low: %f, high: %f\n", low, high);
}
final int ITERATIONS = 1;
for (int iter = 0; iter < ITERATIONS; iter++) {
final TextAnalyzer analysis = new TextAnalyzer("normalCurve");
analysis.setLocale(Locale.forLanguageTag("en-US"));
final SecureRandom random = new SecureRandom();
final int SIZE = 100000;

for (int i = 0; i < SIZE; i++)
analysis.train(String.valueOf(random.nextGaussian()*100));

// Test pre getResult()
String serialized = analysis.serialize();
final TextAnalyzer hydrated = TextAnalyzer.deserialize(serialized);
assertTrue(analysis.equals(hydrated));

// Test a hydrated object
serialized = hydrated.serialize();
final TextAnalyzer rehydrated = TextAnalyzer.deserialize(serialized);
assertEquals(serialized, rehydrated.serialize());

final TextAnalysisResult result = rehydrated.getResult();

// Test post getResult()
serialized = rehydrated.serialize();
assertEquals(serialized, TextAnalyzer.deserialize(serialized).serialize());

assertEquals(result.getSampleCount(), SIZE);
assertEquals(result.getOutlierCount(), 0);
assertEquals(result.getMatchCount(), SIZE);
assertEquals(result.getNullCount(), 0);
assertEquals(result.getConfidence(), 1.0);
assertEquals(result.getType(), FTAType.DOUBLE);
assertEquals(result.getTypeModifier(), "SIGNED");

assertEquals(result.getMean(), 0.0, 1.0);
assertEquals(result.getStandardDeviation(), 100, 1);

final String q0_0 = result.getValueAtQuantile(0);
final String q0_5 = result.getValueAtQuantile(.5);
final String q1_0 = result.getValueAtQuantile(1.0);

// Median should be seriously close to 0
assertEquals(Double.parseDouble(q0_5), 0.0, 1.2);

// 3.5 Standard Deviations should cover low and high points
assertTrue(Double.parseDouble(q0_0) < -350);
assertTrue(Double.parseDouble(q1_0) > 350);

// 101 because we want 0.0 and 1.0 plus everything in between
final double[] percentiles = new double[101];
double value = 0.0;
for (int i = 0; i < 100; i++) {
percentiles[i] = value;
value += .01;
}
// Make sure the last one is precisely 1.0
percentiles[100] = 1.0;

final String[] answers = result.getValuesAtQuantiles(percentiles);
assertEquals(answers[0], q0_0);
assertEquals(answers[50], q0_5);
assertEquals(answers[100], q1_0);

for (int i = 10; i < 50; i++) {
double low = Double.parseDouble(answers[i]);
double high = Double.parseDouble(answers[100 - i]);
// System.err.printf("low: %f, high: %f\n", low, high);
}
}
}

Expand Down
Loading

0 comments on commit 4ea9a01

Please sign in to comment.