From 919bec9a02773a8997c7fa2ee94cd6e2762d6f3e Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 3 Mar 2023 12:32:48 +0000 Subject: [PATCH] Use a library for sloppy date parsing Remove support for dangerous Bundesarchiv date range format (1906/08, meaning the years) and add support for e.g. `1933 ca.` and `1939/09 - 1945/05`. Refactor some confusing parsing code (and introduce some more of our own for ranges.) --- ehri-io/pom.xml | 7 ++ .../importers/base/AbstractImporter.java | 7 ++ .../project/importers/eac/EacImporter.java | 7 +- .../project/importers/ead/EadImporter.java | 13 +-- .../project/importers/eag/EagImporter.java | 2 +- .../importers/managers/SaxImportManager.java | 1 - .../project/importers/util/DateParser.java | 108 ++---------------- .../project/importers/util/DateRange.java | 84 ++++++++++++++ .../importers/util/DateRangeParser.java | 104 +++++++++++++++++ .../project/importers/util/ImportHelpers.java | 12 -- .../importers/ead/BundesarchiveSplitTest.java | 21 ++-- .../importers/ead/BundesarchiveVcTest.java | 4 +- .../importers/ead/StadsarchiefAdamTest.java | 4 +- .../project/importers/ead/Wp2JmpEadTest.java | 4 +- .../project/importers/ead/YadVashemTest.java | 8 +- .../importers/util/DateParserTest.java | 32 +----- .../importers/util/DateRangeParserTest.java | 37 ++++++ .../project/importers/util/DateRangeTest.java | 16 +++ 18 files changed, 294 insertions(+), 177 deletions(-) create mode 100644 ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java create mode 100644 ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java create mode 100644 ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java create mode 100644 ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeTest.java diff --git a/ehri-io/pom.xml b/ehri-io/pom.xml index e93990d0d..1bfb2a3ba 100644 --- a/ehri-io/pom.xml +++ b/ehri-io/pom.xml @@ -102,6 +102,13 @@ 1.4.1 + + + com.github.sisyphsu + dateparser + 1.0.11 + + 
com.fasterxml.jackson.dataformat diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/base/AbstractImporter.java b/ehri-io/src/main/java/eu/ehri/project/importers/base/AbstractImporter.java index 8b3dca922..c8f7672d1 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/base/AbstractImporter.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/base/AbstractImporter.java @@ -28,7 +28,10 @@ import eu.ehri.project.importers.ImportCallback; import eu.ehri.project.importers.ImportLog; import eu.ehri.project.importers.ImportOptions; +import eu.ehri.project.importers.links.LinkResolver; +import eu.ehri.project.importers.util.DateParser; import eu.ehri.project.models.base.Accessible; +import eu.ehri.project.models.base.Accessor; import eu.ehri.project.models.base.Actioner; import eu.ehri.project.models.base.PermissionScope; import eu.ehri.project.persistence.BundleManager; @@ -44,6 +47,8 @@ public abstract class AbstractImporter implements ItemI protected final GraphManager manager; protected final ImportOptions options; protected final ImportLog log; + protected final DateParser dateParser; + protected final LinkResolver linkResolver; private final List callbacks = Lists.newArrayList(); private final List errorCallbacks = Lists.newArrayList(); @@ -91,6 +96,8 @@ public AbstractImporter(FramedGraph graph, PermissionScope scope, Actioner ac this.log = log; this.options = options; manager = GraphManagerFactory.getInstance(graph); + linkResolver = new LinkResolver(graph, actioner.as(Accessor.class)); + dateParser = new DateParser(); } @Override diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/eac/EacImporter.java b/ehri-io/src/main/java/eu/ehri/project/importers/eac/EacImporter.java index 0565af3a8..dbb89d26c 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/eac/EacImporter.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/eac/EacImporter.java @@ -29,12 +29,10 @@ import eu.ehri.project.importers.ImportLog; import 
eu.ehri.project.importers.ImportOptions; import eu.ehri.project.importers.base.AbstractImporter; -import eu.ehri.project.importers.links.LinkResolver; import eu.ehri.project.importers.util.ImportHelpers; import eu.ehri.project.models.AccessPointType; import eu.ehri.project.models.EntityClass; import eu.ehri.project.models.HistoricalAgent; -import eu.ehri.project.models.base.Accessor; import eu.ehri.project.models.base.Actioner; import eu.ehri.project.models.base.Description; import eu.ehri.project.models.base.PermissionScope; @@ -57,8 +55,6 @@ public class EacImporter extends AbstractImporter, Historica private static final String REL_TYPE = "type"; private static final String REL_NAME = "name"; - private final LinkResolver linkResolver; - /** * Construct an EacImporter object. * @@ -70,7 +66,6 @@ public class EacImporter extends AbstractImporter, Historica */ public EacImporter(FramedGraph graph, PermissionScope permissionScope, Actioner actioner, ImportOptions options, ImportLog log) { super(graph, permissionScope, actioner, options, log); - linkResolver = new LinkResolver(graph, actioner.as(Accessor.class)); } @Override @@ -93,7 +88,7 @@ public HistoricalAgent importItem(Map itemData) throws Validatio // Add dates and descriptions to the bundle since they're @Dependent // relations. 
- for (Map dpb : ImportHelpers.extractDates(itemData)) { + for (Map dpb : dateParser.extractDates(itemData)) { descBundle = descBundle.withRelation(Ontology.ENTITY_HAS_DATE, Bundle.of(EntityClass.DATE_PERIOD, dpb)); } diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java b/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java index b36bab2d3..f10bd58b9 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java @@ -30,21 +30,15 @@ import eu.ehri.project.importers.ImportLog; import eu.ehri.project.importers.ImportOptions; import eu.ehri.project.importers.base.AbstractImporter; -import eu.ehri.project.importers.links.LinkResolver; import eu.ehri.project.importers.util.ImportHelpers; import eu.ehri.project.models.AccessPointType; import eu.ehri.project.models.DocumentaryUnit; import eu.ehri.project.models.EntityClass; import eu.ehri.project.models.Repository; import eu.ehri.project.models.base.AbstractUnit; -import eu.ehri.project.models.base.Accessor; import eu.ehri.project.models.base.Actioner; import eu.ehri.project.models.base.PermissionScope; -import eu.ehri.project.persistence.Bundle; -import eu.ehri.project.persistence.BundleManager; -import eu.ehri.project.persistence.Messages; -import eu.ehri.project.persistence.Mutation; -import eu.ehri.project.persistence.Serializer; +import eu.ehri.project.persistence.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,7 +61,6 @@ public class EadImporter extends AbstractImporter, AbstractU //the EadImporter can import ead as DocumentaryUnits, the default, or overwrite those and create VirtualUnits instead. 
private final EntityClass unitEntity = EntityClass.DOCUMENTARY_UNIT; private final Serializer mergeSerializer; - private final LinkResolver linkResolver; public static final String ACCESS_POINT = "AccessPoint"; @@ -83,8 +76,6 @@ public class EadImporter extends AbstractImporter, AbstractU public EadImporter(FramedGraph graph, PermissionScope permissionScope, Actioner actioner, ImportOptions options, ImportLog log) { super(graph, permissionScope, actioner, options, log); mergeSerializer = new Serializer.Builder(graph).dependentOnly().build(); - linkResolver = new LinkResolver(graph, actioner.as(Accessor.class)); - } /** @@ -150,7 +141,7 @@ public AbstractUnit importItem(Map itemData, List idPath * @throws ValidationError when data constraints are not met */ protected Bundle getDescription(Map itemData) throws ValidationError { - List> extractedDates = ImportHelpers.extractDates(itemData); + List> extractedDates = dateParser.extractDates(itemData); Map raw = ImportHelpers.extractDescription(itemData, EntityClass.DOCUMENTARY_UNIT_DESCRIPTION); diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/eag/EagImporter.java b/ehri-io/src/main/java/eu/ehri/project/importers/eag/EagImporter.java index 942f7d5a6..ca6974c35 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/eag/EagImporter.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/eag/EagImporter.java @@ -111,7 +111,7 @@ public Repository importItem(Map itemData) throws ValidationErro // Add dates and descriptions to the bundle since they're @Dependent // relations. 
- for (Map dpb : ImportHelpers.extractDates(itemData)) { + for (Map dpb : dateParser.extractDates(itemData)) { descBundle = descBundle.withRelation(Ontology.ENTITY_HAS_DATE, Bundle.of(EntityClass.DATE_PERIOD, dpb)); } diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java b/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java index 50c9d5ef0..5c7ebc923 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java @@ -54,7 +54,6 @@ public class SaxImportManager extends AbstractImportManager { private static final Logger logger = LoggerFactory.getLogger(SaxImportManager.class); - private static final Config config = ConfigFactory.load(); private final Class handlerClass; private final ImportOptions options; private final List extraCallbacks; diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java index 935ca5aad..1fc8d42ce 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java @@ -5,26 +5,17 @@ import eu.ehri.project.definitions.Entities; import eu.ehri.project.definitions.Ontology; import eu.ehri.project.importers.properties.XmlImportProperties; -import org.joda.time.DateTime; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.text.ParsePosition; -import java.text.SimpleDateFormat; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Optional; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import static eu.ehri.project.importers.util.ImportHelpers.getSubNode; /** - * This class contains static functions to extract date information 
from - * largely unstructured maps. + * Class for extracting date info from unstructured or semi-structured data and text. * * There are two main scenarios: * @@ -37,43 +28,15 @@ * Notable, the function that returns the dates removes the data from * which they were extracted */ -class DateParser { +public class DateParser { private static final Logger logger = LoggerFactory.getLogger(DateParser.class); - - // Various date patterns - private static final Pattern[] datePatterns = { - // Yad Vashem, ICA-Atom style: 1924-1-1 - 1947-12-31 - // Yad Vashem in Wp2: 12-15-1941, 9-30-1944 - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})\\s?-\\s?(\\d{4}-\\d{1,2}-\\d{1,2})$"), - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})$"), - Pattern.compile("^(\\d{4})\\s?-\\s?(\\d{4})$"), - Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4}s)-\\[(\\d{4}s)\\]$"), - Pattern.compile("^\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4})$"), - Pattern.compile("^(\\d{2})th century$"), - Pattern.compile("^\\s*(\\d{4})\\s*-\\s*(\\d{4})"), - //bundesarchive: 1906/19 - Pattern.compile("^\\s*(\\d{4})/(\\d{2})"), - Pattern.compile("^\\s*(\\d{4})\\s*/\\s*(\\d{4})"), - Pattern.compile("^(\\d{4}-\\d{1,2})/(\\d{4}-\\d{1,2})"), - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})/(\\d{4}-\\d{1,2}-\\d{1,2})"), - Pattern.compile("^(\\d{4})/(\\d{4}-\\d{1,2}-\\d{1,2})") - }; - - // NB: Using English locale here to avoid ambiguities caused by system dependent - // time zones such as: Cannot parse "1940-05-16": Illegal instant due to time zone - // offset transition (Europe/Amsterdam) - // https://en.wikipedia.org/wiki/UTC%2B00:20 - private static final DateTimeFormatter isoDateTimeFormat = ISODateTimeFormat.date().withLocale(Locale.ENGLISH); - - // NB: Not static yet since these objects aren't thread safe :( - private static final SimpleDateFormat yearMonthDateFormat = new SimpleDateFormat("yyyy-MM"); - private static final SimpleDateFormat 
yearDateFormat = new SimpleDateFormat("yyyy"); private static final XmlImportProperties dates = new XmlImportProperties("dates.properties"); + private final DateRangeParser rangeParser; + public DateParser() { + rangeParser = new DateRangeParser(); + } /** * Extract a set of dates from input data. The input data is mutated to @@ -82,7 +45,7 @@ class DateParser { * @param data a map of input data * @return a list of parsed date period maps */ - static List> extractDates(Map data) { + public List> extractDates(Map data) { List> extractedDates = Lists.newArrayList(); if (data.containsKey(Entities.DATE_PERIOD)) { @@ -108,7 +71,7 @@ static List> extractDates(Map data) { return extractedDates; } - private static void replaceDates(Map data, List> extractedDates, Map dateValues) { + private void replaceDates(Map data, List> extractedDates, Map dateValues) { Map dateTypes = Maps.newHashMap(); for (String dateValue : dateValues.keySet()) { dateTypes.put(dateValues.get(dateValue), null); @@ -134,23 +97,8 @@ private static void replaceDates(Map data, List> extractDate(String date) { - Map data = matchDate(date); - return data.isEmpty() ? Optional.empty() : Optional.of(data); - } - - private static Map matchDate(String date) { - Map data = Maps.newHashMap(); - for (Pattern re : datePatterns) { - Matcher matcher = re.matcher(date); - if (matcher.matches()) { - data.put(Ontology.DATE_PERIOD_START_DATE, normaliseDate(matcher.group(1))); - data.put(Ontology.DATE_PERIOD_END_DATE, normaliseDate(matcher.group(matcher.groupCount() > 1 ? 
2 : 1), true)); - data.put(Ontology.DATE_HAS_DESCRIPTION, date); - break; - } - } - return data; + private Optional> extractDate(String date) { + return rangeParser.parse(date).map(DateRange::data); } private static Map returnDatesAsString(Map data) { @@ -172,40 +120,4 @@ private static Map returnDatesAsString(Map data) } return datesAsString; } - - static String normaliseDate(String date) { - return normaliseDate(date, false); - } - - /** - * Normalise a date in a string. - * - * @param date a String date that needs formatting - * @param endOfPeriod a string signifying whether this date is the begin of - * a period or the end of a period - * @return a String containing the formatted date. - */ - static String normaliseDate(String date, boolean endOfPeriod) { - String returnDate = isoDateTimeFormat.print(DateTime.parse(date)); - if (returnDate.startsWith("00")) { - returnDate = "19" + returnDate.substring(2); - date = "19" + date; - } - if (endOfPeriod) { - if (!date.equals(returnDate)) { - ParsePosition p = new ParsePosition(0); - yearMonthDateFormat.parse(date, p); - if (p.getIndex() > 0) { - returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusMonths(1).minusDays(1)); - } else { - p = new ParsePosition(0); - yearDateFormat.parse(date, p); - if (p.getIndex() > 0) { - returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusYears(1).minusDays(1)); - } - } - } - } - return returnDate; - } } diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java new file mode 100644 index 000000000..7cc7b9039 --- /dev/null +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java @@ -0,0 +1,84 @@ +package eu.ehri.project.importers.util; + +import eu.ehri.project.definitions.Ontology; +import org.apache.jena.ext.com.google.common.collect.Maps; + +import java.time.*; +import java.time.format.DateTimeFormatter; +import java.util.Map; +import java.util.Objects; + 
+public class DateRange { + + private static final DateTimeFormatter isoDateFormat = DateTimeFormatter.ISO_LOCAL_DATE; + + private final LocalDate start; + private final LocalDate end; + private final String description; + + public DateRange(LocalDate start, LocalDate end, String description) { + if (start == null) { + throw new IllegalArgumentException("DateRange start must not be null"); + } + this.start = start; + this.end = end; + this.description = description; + } + + public static DateRange of(LocalDate start, LocalDate end, String description) { + return new DateRange(start, end, description); + } + + public DateRange(LocalDate start, String description) { + this(start, null, description); + } + + @Override + public String toString() { + return end != null + ? String.format("%s - %s", toLocalDateString(start), toLocalDateString(end)) + : toLocalDateString(start); + } + + /** + * Debug constructor: creates a DateRange from a "YYYY-MM-DD - YYYY-MM-DD" + * string. + */ + public static DateRange fromString(String s, String description) { + final String[] split = s.split("\\s-\\s"); + final LocalDate d1 = LocalDate.parse(split[0]); + final LocalDate d2 = split.length > 1 ? 
LocalDate.parse(split[1]) : null; + return new DateRange(d1, d2, description); + } + + public Map data() { + Map data = Maps.newHashMap(); + data.put(Ontology.DATE_PERIOD_START_DATE, toLocalDateString(start)); + if (end != null) { + data.put(Ontology.DATE_PERIOD_END_DATE, toLocalDateString(end)); + } + if (description != null) { + data.put(Ontology.DATE_HAS_DESCRIPTION, description); + } + return data; + } + + private String toLocalDateString(LocalDate instant) { + return instant.format(isoDateFormat); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DateRange dateRange = (DateRange) o; + return start.equals(dateRange.start) + && Objects.equals(end, dateRange.end) + && description.equals(dateRange.description); + } + + @Override + public int hashCode() { + return Objects.hash(start, end, description); + } +} diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java new file mode 100644 index 000000000..a06fed66c --- /dev/null +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java @@ -0,0 +1,104 @@ +package eu.ehri.project.importers.util; + +import com.github.sisyphsu.dateparser.DateParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDate; +import java.time.format.DateTimeParseException; +import java.time.temporal.TemporalAdjusters; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DateRangeParser { + + private static final Logger logger = LoggerFactory.getLogger(DateRangeParser.class); + private static final Pattern yearRange = Pattern.compile("^(?\\d{4})\\s?[\\-/]\\s?(?\\d{4})$"); + private static final String SEP_CHARS = "/-"; + + private final DateParser parser; + + public DateRangeParser() { + parser = DateParser.newBuilder() + 
.addRule("(?\\d{4}) ca\\.?") + .addRule("summer (?\\d{4})") + .addRule("(?\\d{4})[/\\-](?\\d)[/\\-](?\\d)") + .addRule("(?\\d{2})/(?\\d{4})") + .addRule("(?\\d{4})-(?\\d{2})") + .build(); + } + + private Optional parseRange(String from, String to, String orig) { + final LocalDate d1 = parser.parseDateTime(from).toLocalDate(); + final LocalDate d2 = parser.parseDateTime(to).toLocalDate(); + + // If we don't have a specific day or month, set these to the appropriate maximum... + if (to.replaceAll("\\D", "").length() < 6) { + return Optional.of(DateRange.of( + d1, + d2.with(TemporalAdjusters.lastDayOfYear()), + orig + )); + } else if (to.replaceAll("\\D", "").length() < 8) { + // FIXME: this heuristic could fail on date with 2-digit year, + // like 40, which we don't support anyway... + return Optional.of(DateRange.of( + d1, + d2.with(TemporalAdjusters.lastDayOfMonth()), + orig + )); + } else { + return Optional.of(DateRange.of(d1, d2, orig)); + } + } + + private Optional parseSingle(String date, String orig) { + final LocalDate d = parser.parseDateTime(date).toLocalDate(); + if (date.replaceAll("\\D", "").length() == 4) { + LocalDate d2 = d.with(TemporalAdjusters.lastDayOfYear()); + return Optional.of(DateRange.of(d, d2, orig)); + } + return Optional.of(DateRange.of(d, null, orig)); + } + + /** + * Heuristically attempt to parse a date range. This handles valid dates + * separated by a hyphen (optionally with surrounding spaces.) + * + * @param str a date range string + * @return an DateRange if the string is parsable, or an empty optional if not + */ + public Optional parse(String str) { + try { + // See if the string matches a year range... 
+ final Matcher matcher = yearRange.matcher(str); + if (matcher.matches()) { + return parseRange(matcher.group("start"), matcher.group("end"), str); + } else if (str.contains(" - ")) { + // If it contains a separator with whitespace + final String[] parts = str.split("\\s-\\s"); + return parseRange(parts[0], parts[1], str); + } else { + final int mid = str.length() / 2; + final char midChar = str.charAt(mid); + // If the total string is greater or equal to the minimum length + // for a YEAR-MONTH range and the middle char is a range separator + // attempt to parse each part as a date... + if (str.length() > 12 && SEP_CHARS.indexOf(midChar) != -1) { + // Heuristics: if a string is longer than 12 chars and the + // middle char is a '-', assume it's a date range... + return parseRange( + str.subSequence(0, mid).toString().trim(), + str.subSequence(mid + 1, str.length()).toString().trim(), str); + } else { + // Otherwise, attempt to parse as a single date, or fail... + return parseSingle(str, str); + } + } + } catch (IllegalArgumentException | DateTimeParseException e) { + logger.debug(String.format("Unable to parse date range %s", str), e); + return Optional.empty(); + } + } +} diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java index 38ddd3296..8510a2b83 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java @@ -167,18 +167,6 @@ public static Map extractAddress(Map itemData) { return address; } - /** - * Extract a list of entity bundles for DatePeriods from the data, - * attempting to parse the unitdate attribute. - * - * @param data the data map. 
This is an out parameter from which - * keys associated with extracted dates will be removed - * @return a list of entity bundles - */ - public static List> extractDates(Map data) { - return DateParser.extractDates(data); - } - /** * Extract the data from a sub-node. * diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java index 953832014..2d8cd82bf 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java @@ -34,10 +34,7 @@ import java.io.InputStream; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; public class BundesarchiveSplitTest extends AbstractImporterTest { @@ -66,13 +63,13 @@ public void bundesarchiveTest() throws Exception { // How many new nodes will have been created? 
We should have // - 1 more DocumentaryUnits (archdesc) // - 1 more DocumentDescription - // - 1 more DatePeriod + // - 0 more DatePeriods // - 1 more UnknownProperties // - 3 more Relationships // - 2 more import Event links (1 for every Unit, 1 for the User) // - 1 more import Event // - 5 more MaintenanceEvents (4 revised, 1 created) - int newCount = origCount + 9 + 1 + 4 + 1; + int newCount = origCount + 8 + 1 + 4 + 1; printGraph(graph); assertEquals(newCount, getNodeCount(graph)); @@ -93,14 +90,14 @@ public void bundesarchiveTest() throws Exception { for (DocumentaryUnitDescription d : archUnit.getDocumentDescriptions()) { assertEquals("Reichsschatzmeister der NSDAP", d.getName()); } - //test dates + // test dates (support for parsing these was removed, so they're just + // strings now and not DatePeriods) for (DocumentaryUnitDescription d : archUnit.getDocumentDescriptions()) { - // Single date is just a string - assertFalse(d.getPropertyKeys().contains("unitDates")); + // Single date is not parsable as a range or year + String unitDates = d.getProperty("unitDates"); + assertNotNull(unitDates); List datePeriods = Lists.newArrayList(d.getDatePeriods()); - assertEquals(1, datePeriods.size()); - assertEquals("1906-01-01", datePeriods.get(0).getStartDate()); - assertEquals("1919-12-31", datePeriods.get(0).getEndDate()); + assertEquals(0, datePeriods.size()); } } } diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java index 92ae9f0bc..cb2cea9d8 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java @@ -62,13 +62,13 @@ public void bundesarchiveTest() throws Exception { // How many new nodes will have been created? 
We should have // - 1 more DocumentaryUnits (archdesc) // - 1 more DocumentDescription - // - 1 more DatePeriod + // - 0 more DatePeriods // - 1 more UnknownProperties // - 3 more Relationships // - 2 more import Event links (1 for every Unit, 1 for the User) // - 1 more import Event // - 5 more MaintenanceEvents (4 revised, 1 created) - int newCount = origCount + 15; + int newCount = origCount + 14; assertEquals(newCount, getNodeCount(graph)); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java index 437a5db04..503e6cafe 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java @@ -65,12 +65,12 @@ public void niodEadTest() throws Exception { // How many new nodes will have been created? We should have // - 6 more DocumentaryUnits (archdesc, 5 children) // - 6 more DocumentDescription - // - 1 more DatePeriod + // - 4 more DatePeriod // - 6 more UnknownProperties // - 7 more import Event links (6 for every Unit, 1 for the User) // - 1 more import Event // - 18 more MaintenanceEvents - int newCount = origCount + 45; + int newCount = origCount + 49; assertEquals(newCount, getNodeCount(graph)); DocumentaryUnit archdesc = graph.frame( diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java index 94faedfa0..deda83a81 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java @@ -67,14 +67,14 @@ public void testImportItems() throws Exception { // How many new nodes will have been created? 
We should have // - 7 more DocumentaryUnits fonds C1 C2 C3 4,5,6 // - 7 more DocumentDescription - // - 0 more DatePeriod 0 0 1 + // - 1 more DatePeriod 0 0 1 // - 3 UndeterminedRelationship, 0 0 0 11 // - 8 more import Event links (4 for every Unit, 1 for the User) // - 1 more import Event // - 0 Annotation as resolved relationship // - 1 unknownProperty - int newCount = count + 27; + int newCount = count + 28; assertEquals(newCount, getNodeCount(graph)); Iterable docs = graph.getVertices(Ontology.IDENTIFIER_KEY, FONDS); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java index 37b3d6aec..c5acd3417 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java @@ -99,10 +99,10 @@ public void testWithExistingDescription() throws Exception { * documentDescription: 3 * maintenance event: 3 * systemEvent: 1 - * datePeriod: 1 + * datePeriod: 3 */ - assertEquals(count + 18, getNodeCount(graph)); + assertEquals(count + 20, getNodeCount(graph)); assertEquals(2, toList(m19.getDocumentDescriptions()).size()); for (DocumentaryUnitDescription desc : m19.getDocumentDescriptions()) { logger.debug("Document description graph ID: {}", desc.getId()); @@ -142,9 +142,9 @@ public void testImportItems() throws Exception { * maintenance event: 3 * property: 1 * systemEvent: 1 - * datePeriod: 1 + * datePeriod: 3 */ - assertEquals(count + 20, getNodeCount(graph)); + assertEquals(count + 22, getNodeCount(graph)); //ENG also imported: assertEquals(2, toList(m19.getDocumentDescriptions()).size()); DocumentaryUnit c1 = graph.frame(getVertexByIdentifier(graph, C1), DocumentaryUnit.class); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java index 90087688d..168a817e7 100644 --- 
a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java @@ -5,23 +5,23 @@ import com.google.common.collect.Maps; import eu.ehri.project.definitions.Entities; import eu.ehri.project.definitions.Ontology; -import eu.ehri.project.models.EntityClass; import org.junit.Before; import org.junit.Test; import java.util.List; import java.util.Map; -import static eu.ehri.project.importers.util.DateParser.normaliseDate; import static org.junit.Assert.*; public class DateParserTest { private Map mapWithOneParseableDate; private Map mapWithMultipleDates; private Map mapWithMultipleDatesAsList; + private DateParser parser; @Before public void init() { + parser = new DateParser(); mapWithOneParseableDate = Maps.newHashMap(); mapWithOneParseableDate.put("unitDates", "1934/1936"); @@ -47,7 +47,7 @@ public void init() { @Test public void extractDatesFromDateProperty() { - List> extractedDates = ImportHelpers.extractDates(mapWithOneParseableDate); + List> extractedDates = parser.extractDates(mapWithOneParseableDate); assertEquals(1, extractedDates.size()); assertEquals("1934/1936", extractedDates.get(0).get(Ontology.DATE_HAS_DESCRIPTION)); } @@ -55,7 +55,7 @@ public void extractDatesFromDateProperty() { @Test public void removeDateFromDateProperty() { assertTrue(mapWithOneParseableDate.containsKey("unitDates")); - ImportHelpers.extractDates(mapWithOneParseableDate); + parser.extractDates(mapWithOneParseableDate); assertFalse(mapWithOneParseableDate.containsKey("unitDates")); } @@ -65,7 +65,7 @@ public void removeDatesFromDateProperty() { assertTrue(mapWithMultipleDates.containsKey("existDate")); assertTrue(mapWithMultipleDates.containsKey("unitDates")); assertTrue(mapWithMultipleDates.containsKey(Entities.DATE_PERIOD)); - List> dates = ImportHelpers.extractDates(mapWithMultipleDates); + List> dates = parser.extractDates(mapWithMultipleDates); assertEquals(3, dates.size()); 
assertFalse(mapWithMultipleDates.containsKey(Entities.DATE_PERIOD)); assertEquals("summer 1978", mapWithMultipleDates.get("unitDates")); @@ -75,27 +75,7 @@ public void removeDatesFromDateProperty() { @Test public void removeDatesFromDatePropertyList() { assertTrue(mapWithMultipleDatesAsList.containsKey("unitDates")); - ImportHelpers.extractDates(mapWithMultipleDatesAsList); + parser.extractDates(mapWithMultipleDatesAsList); assertFalse(mapWithMultipleDatesAsList.containsKey("unitDates")); } - - @Test - public void beginDateYear() { - assertEquals("1944-01-01", normaliseDate("1944")); - } - - @Test - public void beginDateYearMonth() { - assertEquals("1944-01-01", normaliseDate("1944-01")); - } - - @Test - public void endDateYear() { - assertEquals("1944-12-31", normaliseDate("1944", true)); - } - - @Test - public void endDateYearMonth() { - assertEquals("1944-01-31", normaliseDate("1944-01", true)); - } } \ No newline at end of file diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java new file mode 100644 index 000000000..c34be762c --- /dev/null +++ b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java @@ -0,0 +1,37 @@ +package eu.ehri.project.importers.util; + +import com.google.common.collect.ImmutableList; +import org.apache.commons.compress.utils.Lists; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.util.Calendar; +import java.util.List; +import java.util.Optional; + +import static org.junit.Assert.*; + +public class DateRangeParserTest { + + private DateRangeParser rangeParser; + + @Before + public void setUp() throws Exception { + rangeParser = new DateRangeParser(); + } + + @Test + public void parse() { + check("1939 ca.", "1939-01-01 - 1939-12-31"); + check("1939 - 1942", "1939-01-01 - 1942-12-31"); + check("01/1942 - 06/1942", "1942-01-01 - 1942-06-30"); + 
check("1935-03/1935-05", "1935-03-01 - 1935-05-31"); + check("summer 1940", "1940-01-01 - 1940-12-31"); // Meh? good enough + check("1939-01-01 - 1942-12-31", "1939-01-01 - 1942-12-31"); + } + + private void check(String sloppy, String canonical) { + assertEquals(Optional.of(DateRange.fromString(canonical, sloppy)), rangeParser.parse(sloppy)); + } +} \ No newline at end of file diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeTest.java new file mode 100644 index 000000000..35510b826 --- /dev/null +++ b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeTest.java @@ -0,0 +1,16 @@ +package eu.ehri.project.importers.util; + +import org.junit.Test; + +import java.time.LocalDate; + +import static org.junit.Assert.assertEquals; + +public class DateRangeTest { + + @Test + public void fromString() { + assertEquals(DateRange.fromString("2020-01-01", ""), + new DateRange(LocalDate.parse("2020-01-01"), "")); + } +} \ No newline at end of file