Skip to content

Commit

Permalink
OPENNLP-1655: Add constructors in SentenceDetectorME and TokenizerME …
Browse files Browse the repository at this point in the history
…to inject custom abbreviation dictionary (#694)
  • Loading branch information
mawiesne authored Nov 25, 2024
1 parent a238e18 commit 374bee9
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,21 @@ public SentenceDetectorME(String language) throws IOException {
* @param model the {@link SentenceModel}
*/
public SentenceDetectorME(SentenceModel model) {
  // Delegates to the two-argument constructor, passing the dictionary returned
  // by model.getAbbreviations(). The explicit constructor call has to be the
  // first statement, so no other statement may precede it.
  this(model, model.getAbbreviations());
}

/**
 * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}
 * and an explicitly supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link SentenceModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
  this.model = model.getMaxentModel();
  // Keep the caller-supplied dictionary; it is not replaced by model.getAbbreviations().
  this.abbDict = abbDict;
  // The remaining collaborators are obtained from the factory of the model.
  SentenceDetectorFactory sdFactory = model.getFactory();
  cgen = sdFactory.getSDContextGenerator();
  scanner = sdFactory.getEndOfSentenceScanner();
  useTokenEnd = sdFactory.isUseTokenEnd();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.IOException;

import opennlp.tools.commons.ThreadSafe;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.Span;

Expand All @@ -43,6 +44,7 @@
public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {

// Model passed to every SentenceDetectorME created by getSD().
private final SentenceModel model;
// Abbreviation dictionary passed to every SentenceDetectorME created by getSD().
private final Dictionary abbDict;

// Holds the SentenceDetectorME of the current thread; filled on first use by getSD().
private final ThreadLocal<SentenceDetectorME> threadLocal = new ThreadLocal<>();

Expand All @@ -63,15 +65,25 @@ public ThreadSafeSentenceDetectorME(String language) throws IOException {
* @param model A valid {@link SentenceModel}.
*/
public ThreadSafeSentenceDetectorME(SentenceModel model) {
  // Delegates to the two-argument constructor, passing the dictionary returned
  // by model.getAbbreviations(). A constructor body may contain only one
  // explicit constructor call, so there is no additional super() call here.
  this(model, model.getAbbreviations());
}

/**
 * Creates a {@link ThreadSafeSentenceDetectorME} from an existing {@link SentenceModel}
 * and a caller-supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link SentenceModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
  // Both values are only stored here; getSD() hands them to the
  // SentenceDetectorME it creates for the calling thread.
  this.abbDict = abbDict;
  this.model = model;
}

// If a thread-local version exists, return it. Otherwise, create, then return.
private SentenceDetectorME getSD() {
SentenceDetectorME sd = threadLocal.get();
if (sd == null) {
sd = new SentenceDetectorME(model);
sd = new SentenceDetectorME(model, abbDict);
threadLocal.set(sd);
}
return sd;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.IOException;

import opennlp.tools.commons.ThreadSafe;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.Span;

Expand All @@ -43,6 +44,7 @@
public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {

// Model passed to every TokenizerME created by getTokenizer().
private final TokenizerModel model;
// Abbreviation dictionary passed to every TokenizerME created by getTokenizer().
private final Dictionary abbDict;

// Holds the TokenizerME of the current thread; filled on first use by getTokenizer().
private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();

Expand All @@ -63,14 +65,24 @@ public ThreadSafeTokenizerME(String language) throws IOException {
* @param model A valid {@link TokenizerModel}.
*/
public ThreadSafeTokenizerME(TokenizerModel model) {
  // Delegates to the two-argument constructor, passing the dictionary returned
  // by model.getAbbreviations(). A constructor body may contain only one
  // explicit constructor call, so there is no additional super() call here.
  this(model, model.getAbbreviations());
}

/**
 * Creates a {@link ThreadSafeTokenizerME} from an existing {@link TokenizerModel}
 * and a caller-supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link TokenizerModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
  // Both values are only stored here; getTokenizer() hands them to the
  // TokenizerME it creates for the calling thread.
  this.abbDict = abbDict;
  this.model = model;
}

private TokenizerME getTokenizer() {
TokenizerME tokenizer = threadLocal.get();
if (tokenizer == null) {
tokenizer = new TokenizerME(model);
tokenizer = new TokenizerME(model, abbDict);
threadLocal.set(tokenizer);
}
return tokenizer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,23 @@ public TokenizerME(String language) throws IOException {
* @param model The {@link TokenizerModel} to be used.
*/
public TokenizerME(TokenizerModel model) {
// Delegates to the two-argument constructor, passing the dictionary
// returned by model.getAbbreviations().
this(model, model.getAbbreviations());
}

/**
 * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}
 * and an explicitly supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link TokenizerModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public TokenizerME(TokenizerModel model, Dictionary abbDict) {
  this.model = model.getMaxentModel();
  // Keep the caller-supplied dictionary; it is not replaced by model.getAbbreviations().
  this.abbDict = abbDict;

  // Each factory-provided setting is assigned exactly once.
  TokenizerFactory factory = model.getFactory();
  this.cg = factory.getContextGenerator();
  this.alphanumeric = factory.getAlphaNumericPattern();
  this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();

  newTokens = new ArrayList<>();
  tokProbs = new ArrayList<>(50);
}
Expand Down

0 comments on commit 374bee9

Please sign in to comment.