Skip to content

Commit

Permalink
OPENNLP-1629 Update DownloadUtil to support more languages via new UD models
Browse files Browse the repository at this point in the history

- updates opennlp-models dependency to 1.1
- adapts DownloadUtil to work with ud-models-1.1 release, adding 18 new supported languages
- adapts JUnit tests accordingly to include the new model files
- replaces content of 'index.html' copy of the released model list (v 1.1.0) for DownloadParserTest
  • Loading branch information
mawiesne committed Oct 28, 2024
1 parent f41857c commit f4d8316
Show file tree
Hide file tree
Showing 8 changed files with 471 additions and 128 deletions.
56 changes: 43 additions & 13 deletions opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ public enum ModelType {
}

private static final String BASE_URL = "https://dlcdn.apache.org/opennlp/";
private static final String MODELS_UD_MODELS_1_0 = "models/ud-models-1.0/";
private static final String MODELS_UD_MODELS_1_1 = "models/ud-models-1.1/";

public static final Map<String, Map<ModelType, String>> available_models;

// Populates available_models once, when the class is initialised, from the
// ud-models-1.1 listing below BASE_URL. The field is final and therefore has to
// be assigned exactly once; only the 1.1 location is used.
static {
  try {
    // NOTE(review): getAvailableModels() calls fetchPageIndex(), which presumably
    // reads the remote index page - i.e. class initialisation may need network access.
    available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_1)).getAvailableModels();
  } catch (MalformedURLException e) {
    // Both URL parts are compile-time constants, so this indicates a programming
    // error; rethrow unchecked and keep the cause.
    throw new RuntimeException(e);
  }
}
Expand Down Expand Up @@ -214,7 +214,6 @@ static class DownloadParser {
}

Map<String, Map<ModelType, String>> getAvailableModels() {

final Matcher matcher = LINK_PATTERN.matcher(fetchPageIndex());

final List<String> links = new ArrayList<>();
Expand All @@ -226,27 +225,58 @@ Map<String, Map<ModelType, String>> getAvailableModels() {
}

private Map<String, Map<ModelType, String>> toMap(List<String> links) {

final Map<String, Map<ModelType, String>> result = new HashMap<>();

for (String link : links) {

if (link.endsWith(".bin")) {
if (link.contains("de-ud")) {
if (link.contains("de-ud")) { // German
addModel("de", link, result);
} else if (link.contains("en-ud")) {
} else if (link.contains("en-ud")) { // English
addModel("en", link, result);
} else if (link.contains("it-ud")) {
} else if (link.contains("it-ud")) { // Italian
addModel("it", link, result);
} else if (link.contains("nl-ud")) {
} else if (link.contains("nl-ud")) { // Dutch
addModel("nl", link, result);
} else if (link.contains("fr-ud")) {
} else if (link.contains("fr-ud")) { // French
addModel("fr", link, result);
} else if (link.contains("bg-ud")) { // Bulgarian
addModel("bg", link, result);
} else if (link.contains("cs-ud")) { // Czech
addModel("cs", link, result);
} else if (link.contains("hr-ud")) { // Croatian
addModel("hr", link, result);
} else if (link.contains("da-ud")) { // Danish
addModel("da", link, result);
} else if (link.contains("es-ud")) { // Spanish
addModel("es", link, result);
} else if (link.contains("et-ud")) { // Estonian
addModel("et", link, result);
} else if (link.contains("fi-ud")) { // Finnish
addModel("fi", link, result);
} else if (link.contains("lv-ud")) { // Latvian
addModel("lv", link, result);
} else if (link.contains("no-ud")) { // Norwegian
addModel("no", link, result);
} else if (link.contains("pl-ud")) { // Polish
addModel("pl", link, result);
} else if (link.contains("pt-ud")) { // Portuguese
addModel("pt", link, result);
} else if (link.contains("ro-ud")) { // Romanian
addModel("ro", link, result);
} else if (link.contains("ru-ud")) { // Russian
addModel("ru", link, result);
} else if (link.contains("sr-ud")) { // Serbian
addModel("sr", link, result);
} else if (link.contains("sk-ud")) { // Slovak
addModel("sk", link, result);
} else if (link.contains("sl-ud")) { // Slovenian
addModel("sl", link, result);
} else if (link.contains("sv-ud")) { // Swedish
addModel("sv", link, result);
} else if (link.contains("uk-ud")) { // Ukrainian
addModel("uk", link, result);
}
}

}

return result;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public void setup() {
@ParameterizedTest(name = "Verify \"{0}\" POS model loading")
@ValueSource(strings = {"en-ud-ewt", "de-ud-gsd"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-pos-1.0-1.9.3.bin";
String modelName = "opennlp-" + langModel + "-pos-1.1-2.4.0.bin";
POSModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public void setup() {
@ParameterizedTest(name = "Verify \"{0}\" sentence model loading")
@ValueSource(strings = {"en-ud-ewt", "de-ud-gsd"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-sentence-1.0-1.9.3.bin";
String modelName = "opennlp-" + langModel + "-sentence-1.1-2.4.0.bin";
SentenceModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public void setup() {
@ParameterizedTest(name = "Verify \"{0}\" tokenizer model loading")
@ValueSource(strings = {"en-ud-ewt", "de-ud-gsd"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-tokens-1.0-1.9.3.bin";
String modelName = "opennlp-" + langModel + "-tokens-1.1-2.4.0.bin";
TokenizerModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Expand Down
126 changes: 100 additions & 26 deletions opennlp-tools/src/test/java/opennlp/tools/util/DownloadParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void testAvailableModels(String language, Map<DownloadUtil.ModelType, String> ex
Map<String, Map<DownloadUtil.ModelType, String>> result = downloadParser.getAvailableModels();

assertNotNull(result);
assertEquals(5, result.size());
assertEquals(23, result.size());

final Map<DownloadUtil.ModelType, String> availableModels = result.get(language);
assertNotNull(availableModels);
Expand Down Expand Up @@ -78,35 +78,109 @@ private URL fromClasspath(String file) {
return Thread.currentThread().getContextClassLoader().getResource(file);
}

// Building blocks of the expected model file names. expectedModels() assembles them as
// OPENNLP + "<lang>-ud-<treebank>-" + MODEL_* + VER + BIN.
private static final String OPENNLP = "opennlp-";
private static final String MODEL_SENT = "sentence-";
private static final String MODEL_TOK = "tokens-";
private static final String MODEL_POS = "pos-";
// Version suffix shared by all expected file names.
// NOTE(review): presumably "<models release>-<OpenNLP version>" - confirm.
private static final String VER = "1.1-2.4.0";
private static final String BIN = ".bin";

// Note: This needs to be public as JUnit 5 requires it like this.
public static Stream<Arguments> expectedModels() {
// Data as defined in "test/resources/opennlp/tools/util/index.html"
return Stream.of(
Arguments.of("en",
Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, "opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin",
DownloadUtil.ModelType.TOKENIZER, "opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin",
DownloadUtil.ModelType.POS, "opennlp-en-ud-ewt-pos-1.0-1.9.3.bin")),
Arguments.of("fr",
Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, "opennlp-1.0-1.9.3fr-ud-ftb-sentence-1.0-1.9.3.bin",
DownloadUtil.ModelType.TOKENIZER, "opennlp-fr-ud-ftb-tokens-1.0-1.9.3.bin",
DownloadUtil.ModelType.POS, "opennlp-fr-ud-ftb-pos-1.0-1.9.3.bin")),
Arguments.of("de",
Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, "opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin",
DownloadUtil.ModelType.TOKENIZER, "opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin",
DownloadUtil.ModelType.POS, "opennlp-de-ud-gsd-pos-1.0-1.9.3.bin")),
Arguments.of("it",
Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, "opennlp-it-ud-vit-sentence-1.0-1.9.3.bin",
DownloadUtil.ModelType.TOKENIZER, "opennlp-it-ud-vit-tokens-1.0-1.9.3.bin",
DownloadUtil.ModelType.POS, "opennlp-it-ud-vit-pos-1.0-1.9.3.bin")),
Arguments.of("nl",
Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, "opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin",
DownloadUtil.ModelType.TOKENIZER, "opennlp-nl-ud-alpino-tokens-1.0-1.9.3.bin",
DownloadUtil.ModelType.POS, "opennlp-nl-ud-alpino-pos-1.0-1.9.3.bin"))
Arguments.of("en",Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "en-ud-ewt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "en-ud-ewt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "en-ud-ewt-" + MODEL_POS + VER + BIN)),
Arguments.of("fr", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "fr-ud-gsd-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "fr-ud-gsd-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "fr-ud-gsd-" + MODEL_POS + VER + BIN)),
Arguments.of("de", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "de-ud-gsd-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "de-ud-gsd-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "de-ud-gsd-" + MODEL_POS + VER + BIN)),
Arguments.of("it", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "it-ud-vit-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "it-ud-vit-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "it-ud-vit-" + MODEL_POS + VER + BIN)),
Arguments.of("bg", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "bg-ud-btb-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "bg-ud-btb-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "bg-ud-btb-" + MODEL_POS + VER + BIN)),
Arguments.of("cs", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "cs-ud-pdt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "cs-ud-pdt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "cs-ud-pdt-" + MODEL_POS + VER + BIN)),
Arguments.of("da", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "da-ud-ddt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "da-ud-ddt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "da-ud-ddt-" + MODEL_POS + VER + BIN)),
Arguments.of("es", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "es-ud-gsd-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "es-ud-gsd-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "es-ud-gsd-" + MODEL_POS + VER + BIN)),
Arguments.of("et", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "et-ud-edt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "et-ud-edt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "et-ud-edt-" + MODEL_POS + VER + BIN)),
Arguments.of("fi", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "fi-ud-tdt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "fi-ud-tdt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "fi-ud-tdt-" + MODEL_POS + VER + BIN)),
Arguments.of("hr", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "hr-ud-set-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "hr-ud-set-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "hr-ud-set-" + MODEL_POS + VER + BIN)),
Arguments.of("lv", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "lv-ud-lvtb-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "lv-ud-lvtb-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "lv-ud-lvtb-" + MODEL_POS + VER + BIN)),
Arguments.of("lv", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "lv-ud-lvtb-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "lv-ud-lvtb-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "lv-ud-lvtb-" + MODEL_POS + VER + BIN)),
Arguments.of("no", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "no-ud-bokmaal-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "no-ud-bokmaal-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "no-ud-bokmaal-" + MODEL_POS + VER + BIN)),
Arguments.of("pl", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "pl-ud-pdb-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "pl-ud-pdb-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "pl-ud-pdb-" + MODEL_POS + VER + BIN)),
Arguments.of("pt", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "pt-ud-gsd-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "pt-ud-gsd-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "pt-ud-gsd-" + MODEL_POS + VER + BIN)),
Arguments.of("ro", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "ro-ud-rrt-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "ro-ud-rrt-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "ro-ud-rrt-" + MODEL_POS + VER + BIN)),
Arguments.of("ru", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "ru-ud-gsd-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "ru-ud-gsd-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "ru-ud-gsd-" + MODEL_POS + VER + BIN)),
Arguments.of("sr", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "sr-ud-set-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "sr-ud-set-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "sr-ud-set-" + MODEL_POS + VER + BIN)),
Arguments.of("sk", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "sk-ud-snk-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "sk-ud-snk-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "sk-ud-snk-" + MODEL_POS + VER + BIN)),
Arguments.of("sl", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "sl-ud-ssj-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "sl-ud-ssj-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "sl-ud-ssj-" + MODEL_POS + VER + BIN)),
Arguments.of("sv", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "sv-ud-talbanken-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "sv-ud-talbanken-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "sv-ud-talbanken-" + MODEL_POS + VER + BIN)),
Arguments.of("uk", Map.of(
DownloadUtil.ModelType.SENTENCE_DETECTOR, OPENNLP + "uk-ud-iu-" + MODEL_SENT + VER + BIN,
DownloadUtil.ModelType.TOKENIZER, OPENNLP + "uk-ud-iu-" + MODEL_TOK + VER + BIN,
DownloadUtil.ModelType.POS, OPENNLP + "uk-ud-iu-" + MODEL_POS + VER + BIN))
);
}
}
Loading

0 comments on commit f4d8316

Please sign in to comment.