From 2d48bd2d6345b1354bc7a18884fc0202f450aff9 Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 3 Mar 2023 12:32:48 +0000 Subject: [PATCH] WIP: use a library for sloppy date parsing. Remove support for dangerous Bundesarchiv date range format (1906/08, meaning the years) and add support for e.g. `1933 ca.` and `1939/09 - 1945/05`. More refactoring and tests are needed here. --- ehri-io/pom.xml | 7 ++ .../project/importers/ead/EadImporter.java | 5 +- .../importers/managers/SaxImportManager.java | 1 - .../project/importers/util/DateParser.java | 108 ++---------------- .../project/importers/util/DateRange.java | 59 ++++++++++ .../importers/util/DateRangeParser.java | 90 +++++++++++++++ .../project/importers/util/ImportHelpers.java | 3 +- .../importers/ead/BundesarchiveSplitTest.java | 21 ++-- .../importers/ead/BundesarchiveVcTest.java | 4 +- .../importers/ead/StadsarchiefAdamTest.java | 4 +- .../project/importers/ead/Wp2JmpEadTest.java | 4 +- .../project/importers/ead/YadVashemTest.java | 8 +- .../importers/util/DateParserTest.java | 22 +--- .../importers/util/DateRangeParserTest.java | 40 +++++++ 14 files changed, 232 insertions(+), 144 deletions(-) create mode 100644 ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java create mode 100644 ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java create mode 100644 ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java diff --git a/ehri-io/pom.xml b/ehri-io/pom.xml index e93990d0d..1bfb2a3ba 100644 --- a/ehri-io/pom.xml +++ b/ehri-io/pom.xml @@ -102,6 +102,13 @@ 1.4.1 + + + com.github.sisyphsu + dateparser + 1.0.11 + + com.fasterxml.jackson.dataformat diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java b/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java index 7155a710a..4f24dc790 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/ead/EadImporter.java @@ -31,6 +31,7 @@ import eu.ehri.project.importers.ImportOptions; import eu.ehri.project.importers.base.AbstractImporter; import eu.ehri.project.importers.links.LinkResolver; +import eu.ehri.project.importers.util.DateParser; import eu.ehri.project.importers.util.ImportHelpers; import eu.ehri.project.models.AccessPointType; import eu.ehri.project.models.DocumentaryUnit; @@ -68,6 +69,7 @@ public class EadImporter extends AbstractImporter, AbstractU private final EntityClass unitEntity = EntityClass.DOCUMENTARY_UNIT; private final Serializer mergeSerializer; private final LinkResolver linkResolver; + private final DateParser dateParser; public static final String ACCESS_POINT = "AccessPoint"; @@ -84,6 +86,7 @@ public EadImporter(FramedGraph graph, PermissionScope permissionScope, Action super(graph, permissionScope, actioner, options, log); mergeSerializer = new Serializer.Builder(graph).dependentOnly().build(); linkResolver = new LinkResolver(graph, actioner.as(Accessor.class)); + dateParser = new DateParser(); } @@ -150,7 +153,7 @@ public AbstractUnit importItem(Map itemData, List idPath * @throws ValidationError when data constraints are not met */ protected Bundle getDescription(Map itemData) throws ValidationError { - List> extractedDates = ImportHelpers.extractDates(itemData); + List> extractedDates = dateParser.extractDates(itemData); Map raw = ImportHelpers.extractDescription(itemData, EntityClass.DOCUMENTARY_UNIT_DESCRIPTION); diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java b/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java index 50c9d5ef0..5c7ebc923 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/managers/SaxImportManager.java @@ -54,7 +54,6 @@ public class SaxImportManager extends AbstractImportManager { private static final Logger logger = LoggerFactory.getLogger(SaxImportManager.class); - private static final Config config = ConfigFactory.load(); private final Class handlerClass; private final ImportOptions options; private final List extraCallbacks; diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java index 935ca5aad..1fc8d42ce 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java @@ -5,26 +5,17 @@ import eu.ehri.project.definitions.Entities; import eu.ehri.project.definitions.Ontology; import eu.ehri.project.importers.properties.XmlImportProperties; -import org.joda.time.DateTime; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.text.ParsePosition; -import java.text.SimpleDateFormat; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Optional; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import static eu.ehri.project.importers.util.ImportHelpers.getSubNode; /** - * This class contains static functions to extract date information from - * largely unstructured maps. + * Class for extracting date info from unstructured or semi-structured data and text. * * There are two main scenarios: * @@ -37,43 +28,15 @@ * Notable, the function that returns the dates removes the data from * which they were extracted */ -class DateParser { +public class DateParser { private static final Logger logger = LoggerFactory.getLogger(DateParser.class); - - // Various date patterns - private static final Pattern[] datePatterns = { - // Yad Vashem, ICA-Atom style: 1924-1-1 - 1947-12-31 - // Yad Vashem in Wp2: 12-15-1941, 9-30-1944 - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})\\s?-\\s?(\\d{4}-\\d{1,2}-\\d{1,2})$"), - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})$"), - Pattern.compile("^(\\d{4})\\s?-\\s?(\\d{4})$"), - Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4}s)-\\[(\\d{4}s)\\]$"), - Pattern.compile("^\\[(\\d{4})\\]$"), - Pattern.compile("^(\\d{4})$"), - Pattern.compile("^(\\d{2})th century$"), - Pattern.compile("^\\s*(\\d{4})\\s*-\\s*(\\d{4})"), - //bundesarchive: 1906/19 - Pattern.compile("^\\s*(\\d{4})/(\\d{2})"), - Pattern.compile("^\\s*(\\d{4})\\s*/\\s*(\\d{4})"), - Pattern.compile("^(\\d{4}-\\d{1,2})/(\\d{4}-\\d{1,2})"), - Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})/(\\d{4}-\\d{1,2}-\\d{1,2})"), - Pattern.compile("^(\\d{4})/(\\d{4}-\\d{1,2}-\\d{1,2})") - }; - - // NB: Using English locale here to avoid ambiguities caused by system dependent - // time zones such as: Cannot parse "1940-05-16": Illegal instant due to time zone - // offset transition (Europe/Amsterdam) - // https://en.wikipedia.org/wiki/UTC%2B00:20 - private static final DateTimeFormatter isoDateTimeFormat = ISODateTimeFormat.date().withLocale(Locale.ENGLISH); - - // NB: Not static yet since these objects aren't thread safe :( - private static final SimpleDateFormat yearMonthDateFormat = new SimpleDateFormat("yyyy-MM"); - private static final SimpleDateFormat yearDateFormat = new SimpleDateFormat("yyyy"); private static final XmlImportProperties dates = new XmlImportProperties("dates.properties"); + private final DateRangeParser rangeParser; + public DateParser() { + rangeParser = new DateRangeParser(); + } /** * Extract a set of dates from input data. The input data is mutated to @@ -82,7 +45,7 @@ class DateParser { * @param data a map of input data * @return a list of parsed date period maps */ - static List> extractDates(Map data) { + public List> extractDates(Map data) { List> extractedDates = Lists.newArrayList(); if (data.containsKey(Entities.DATE_PERIOD)) { @@ -108,7 +71,7 @@ static List> extractDates(Map data) { return extractedDates; } - private static void replaceDates(Map data, List> extractedDates, Map dateValues) { + private void replaceDates(Map data, List> extractedDates, Map dateValues) { Map dateTypes = Maps.newHashMap(); for (String dateValue : dateValues.keySet()) { dateTypes.put(dateValues.get(dateValue), null); @@ -134,23 +97,8 @@ private static void replaceDates(Map data, List> extractDate(String date) { - Map data = matchDate(date); - return data.isEmpty() ? Optional.empty() : Optional.of(data); - } - - private static Map matchDate(String date) { - Map data = Maps.newHashMap(); - for (Pattern re : datePatterns) { - Matcher matcher = re.matcher(date); - if (matcher.matches()) { - data.put(Ontology.DATE_PERIOD_START_DATE, normaliseDate(matcher.group(1))); - data.put(Ontology.DATE_PERIOD_END_DATE, normaliseDate(matcher.group(matcher.groupCount() > 1 ? 2 : 1), true)); - data.put(Ontology.DATE_HAS_DESCRIPTION, date); - break; - } - } - return data; + private Optional> extractDate(String date) { + return rangeParser.parse(date).map(DateRange::data); } private static Map returnDatesAsString(Map data) { @@ -172,40 +120,4 @@ private static Map returnDatesAsString(Map data) } return datesAsString; } - - static String normaliseDate(String date) { - return normaliseDate(date, false); - } - - /** - * Normalise a date in a string. - * - * @param date a String date that needs formatting - * @param endOfPeriod a string signifying whether this date is the begin of - * a period or the end of a period - * @return a String containing the formatted date. - */ - static String normaliseDate(String date, boolean endOfPeriod) { - String returnDate = isoDateTimeFormat.print(DateTime.parse(date)); - if (returnDate.startsWith("00")) { - returnDate = "19" + returnDate.substring(2); - date = "19" + date; - } - if (endOfPeriod) { - if (!date.equals(returnDate)) { - ParsePosition p = new ParsePosition(0); - yearMonthDateFormat.parse(date, p); - if (p.getIndex() > 0) { - returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusMonths(1).minusDays(1)); - } else { - p = new ParsePosition(0); - yearDateFormat.parse(date, p); - if (p.getIndex() > 0) { - returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusYears(1).minusDays(1)); - } - } - } - } - return returnDate; - } } diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java new file mode 100644 index 000000000..7c9ded776 --- /dev/null +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java @@ -0,0 +1,59 @@ +package eu.ehri.project.importers.util; + +import eu.ehri.project.definitions.Ontology; +import org.apache.jena.ext.com.google.common.collect.Maps; + +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.Map; + +public class DateRange { + + private static final DateTimeFormatter isoDateFormat = DateTimeFormatter.ISO_LOCAL_DATE; + private static final DateTimeFormatter isoDateTimeFormat = DateTimeFormatter.ISO_LOCAL_DATE_TIME; + + private final Instant start; + private final Instant end; + private final String description; + + public DateRange(Instant start, Instant end, String description) { + if (start == null) { + throw new IllegalArgumentException("DateRange start must not be null"); + } + this.start = start; + this.end = end; + this.description = description; + } + + public DateRange(Instant start, String description) { + this(start, null, description); + } + + @Override + public String toString() { + return end != null + ? String.format("%s - %s", toLocalDateString(start), toLocalDateString(end)) + : toLocalDateString(start); + } + + public Map data() { + Map data = Maps.newHashMap(); + data.put(Ontology.DATE_PERIOD_START_DATE, toLocalDateString(start)); + if (end != null) { + data.put(Ontology.DATE_PERIOD_END_DATE, toLocalDateString(end)); + } + if (description != null) { + data.put(Ontology.DATE_HAS_DESCRIPTION, description); + } + return data; + } + + private String toLocalDateString(Instant instant) { + return instant.atZone(ZoneId.systemDefault()).format(isoDateFormat); + } + + private String toLocalDateTimeString(Instant instant) { + return instant.atZone(ZoneId.systemDefault()).format(isoDateTimeFormat); + } +} diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java new file mode 100644 index 000000000..f1da4974f --- /dev/null +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java @@ -0,0 +1,90 @@ +package eu.ehri.project.importers.util; + +import com.github.sisyphsu.dateparser.DateParser; + +import java.time.Instant; +import java.time.format.DateTimeParseException; +import java.util.Calendar; +import java.util.Date; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DateRangeParser { + + private static final Pattern yearRange = Pattern.compile("^(?\\d{4})\\s?[\\-/]\\s?(?\\d{4})$"); + + private final DateParser parser; + + public DateRangeParser() { + parser = DateParser.newBuilder() + .addRule("(?\\d{4}) ca\\.?") + .addRule("summer (?\\d{4})") + .addRule("(?\\d{4})[/\\-](?\\d)[/\\-](?\\d)") + .addRule("(?\\d{2})/(?\\d{4})") + .addRule("(?\\d{4})-(?\\d{2})") + .build(); + } + + public static Optional parseDate(String str) { + DateRangeParser parser = new DateRangeParser(); + return parser.parse(str); + } + + private Optional parseRange(String from, String to, String orig) { + + final Calendar d1 = parser.parseCalendar(from); + final Calendar d2 = parser.parseCalendar(to); + + // If we don't have a specific day or month, set these to the appropriate maximum... + if (to.length() == 4) { + d2.set(Calendar.MONTH, d2.getActualMaximum(Calendar.MONTH)); + d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); + } else if (to.replaceAll("\\D", "").length() < 8) { + // FIXME: this heuristic could fail on date with 2-digit year, + // like 201240, which we don't support anyway... + d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); + } + + return Optional.of(new DateRange(d1.toInstant(), d2.toInstant(), orig)); + } + + private Optional parseSingle(String date, String orig) { + final Calendar d = parser.parseCalendar(date); + Instant end = null; + if (date.length() == 4) { + Calendar d2 = (Calendar) d.clone(); + d2.set(Calendar.MONTH, d2.getActualMaximum(Calendar.MONTH)); + d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); + end = d2.toInstant(); + } + return Optional.of(new DateRange(d.toInstant(), end, orig)); + } + + public Optional parse(String str) { + try { + // See if the string matches a year range... + final Matcher matcher = yearRange.matcher(str); + if (matcher.matches()) { + return parseRange(matcher.group("start"), matcher.group("end"), str); + + } else if (str.contains(" - ")) { + final String[] parts = str.split("\\s-\\s"); + return parseRange(parts[0], parts[1], str); + } else if (str.length() > 12 && (str.charAt(str.length() / 2) == '-' || str.charAt(str.length() / 2) == '/')) { + // Heuristics: if a string is longer than 13 chars and the + // middle char is a '-', assume it's a date range... + return parseRange( + str.subSequence(0, str.length() / 2).toString().trim(), + str.subSequence((str.length() / 2) + 1, str.length()).toString().trim(), str); + } else { + // Otherwise, attempt to parse as a single date, or fail... + return parseSingle(str, str); + } + } catch (IllegalArgumentException | DateTimeParseException e) { + // TODO ? anything else? + e.printStackTrace(); + return Optional.empty(); + } + } +} diff --git a/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java b/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java index 38ddd3296..b7250ed91 100644 --- a/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java +++ b/ehri-io/src/main/java/eu/ehri/project/importers/util/ImportHelpers.java @@ -176,7 +176,8 @@ public static Map extractAddress(Map itemData) { * @return a list of entity bundles */ public static List> extractDates(Map data) { - return DateParser.extractDates(data); + DateParser dateParser = new DateParser(); + return dateParser.extractDates(data); } /** diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java index 953832014..2d8cd82bf 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveSplitTest.java @@ -34,10 +34,7 @@ import java.io.InputStream; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; public class BundesarchiveSplitTest extends AbstractImporterTest { @@ -66,13 +63,13 @@ public void bundesarchiveTest() throws Exception { // How many new nodes will have been created? We should have // - 1 more DocumentaryUnits (archdesc) // - 1 more DocumentDescription - // - 1 more DatePeriod + // - 0 more DatePeriods // - 1 more UnknownProperties // - 3 more Relationships // - 2 more import Event links (1 for every Unit, 1 for the User) // - 1 more import Event // - 5 more MaintenanceEvents (4 revised, 1 created) - int newCount = origCount + 9 + 1 + 4 + 1; + int newCount = origCount + 8 + 1 + 4 + 1; printGraph(graph); assertEquals(newCount, getNodeCount(graph)); @@ -93,14 +90,14 @@ public void bundesarchiveTest() throws Exception { for (DocumentaryUnitDescription d : archUnit.getDocumentDescriptions()) { assertEquals("Reichsschatzmeister der NSDAP", d.getName()); } - //test dates + // test dates (support for parsing these was removed, so they're just + // strings now and not DatePeriods) for (DocumentaryUnitDescription d : archUnit.getDocumentDescriptions()) { - // Single date is just a string - assertFalse(d.getPropertyKeys().contains("unitDates")); + // Single date is not parsable as a range or year + String unitDates = d.getProperty("unitDates"); + assertNotNull(unitDates); List datePeriods = Lists.newArrayList(d.getDatePeriods()); - assertEquals(1, datePeriods.size()); - assertEquals("1906-01-01", datePeriods.get(0).getStartDate()); - assertEquals("1919-12-31", datePeriods.get(0).getEndDate()); + assertEquals(0, datePeriods.size()); } } } diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java index 92ae9f0bc..cb2cea9d8 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/BundesarchiveVcTest.java @@ -62,13 +62,13 @@ public void bundesarchiveTest() throws Exception { // How many new nodes will have been created? We should have // - 1 more DocumentaryUnits (archdesc) // - 1 more DocumentDescription - // - 1 more DatePeriod + // - 0 more DatePeriods // - 1 more UnknownProperties // - 3 more Relationships // - 2 more import Event links (1 for every Unit, 1 for the User) // - 1 more import Event // - 5 more MaintenanceEvents (4 revised, 1 created) - int newCount = origCount + 15; + int newCount = origCount + 14; assertEquals(newCount, getNodeCount(graph)); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java index 437a5db04..503e6cafe 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/StadsarchiefAdamTest.java @@ -65,12 +65,12 @@ public void niodEadTest() throws Exception { // How many new nodes will have been created? We should have // - 6 more DocumentaryUnits (archdesc, 5 children) // - 6 more DocumentDescription - // - 1 more DatePeriod + // - 4 more DatePeriod // - 6 more UnknownProperties // - 7 more import Event links (6 for every Unit, 1 for the User) // - 1 more import Event // - 18 more MaintenanceEvents - int newCount = origCount + 45; + int newCount = origCount + 49; assertEquals(newCount, getNodeCount(graph)); DocumentaryUnit archdesc = graph.frame( diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java index 94faedfa0..deda83a81 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/Wp2JmpEadTest.java @@ -67,14 +67,14 @@ public void testImportItems() throws Exception { // How many new nodes will have been created? We should have // - 7 more DocumentaryUnits fonds C1 C2 C3 4,5,6 // - 7 more DocumentDescription - // - 0 more DatePeriod 0 0 1 + // - 1 more DatePeriod 0 0 1 // - 3 UndeterminedRelationship, 0 0 0 11 // - 8 more import Event links (4 for every Unit, 1 for the User) // - 1 more import Event // - 0 Annotation as resolved relationship // - 1 unknownProperty - int newCount = count + 27; + int newCount = count + 28; assertEquals(newCount, getNodeCount(graph)); Iterable docs = graph.getVertices(Ontology.IDENTIFIER_KEY, FONDS); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java index 37b3d6aec..c5acd3417 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/ead/YadVashemTest.java @@ -99,10 +99,10 @@ public void testWithExistingDescription() throws Exception { * documentDescription: 3 * maintenance event: 3 * systemEvent: 1 - * datePeriod: 1 + * datePeriod: 3 */ - assertEquals(count + 18, getNodeCount(graph)); + assertEquals(count + 20, getNodeCount(graph)); assertEquals(2, toList(m19.getDocumentDescriptions()).size()); for (DocumentaryUnitDescription desc : m19.getDocumentDescriptions()) { logger.debug("Document description graph ID: {}", desc.getId()); @@ -142,9 +142,9 @@ public void testImportItems() throws Exception { * maintenance event: 3 * property: 1 * systemEvent: 1 - * datePeriod: 1 + * datePeriod: 3 */ - assertEquals(count + 20, getNodeCount(graph)); + assertEquals(count + 22, getNodeCount(graph)); //ENG also imported: assertEquals(2, toList(m19.getDocumentDescriptions()).size()); DocumentaryUnit c1 = graph.frame(getVertexByIdentifier(graph, C1), DocumentaryUnit.class); diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java index 90087688d..625c4c412 100644 --- a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java +++ b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateParserTest.java @@ -12,7 +12,7 @@ import java.util.List; import java.util.Map; -import static eu.ehri.project.importers.util.DateParser.normaliseDate; +//import static eu.ehri.project.importers.util.DateParser.normaliseDate; import static org.junit.Assert.*; public class DateParserTest { @@ -78,24 +78,4 @@ public void removeDatesFromDatePropertyList() { ImportHelpers.extractDates(mapWithMultipleDatesAsList); assertFalse(mapWithMultipleDatesAsList.containsKey("unitDates")); } - - @Test - public void beginDateYear() { - assertEquals("1944-01-01", normaliseDate("1944")); - } - - @Test - public void beginDateYearMonth() { - assertEquals("1944-01-01", normaliseDate("1944-01")); - } - - @Test - public void endDateYear() { - assertEquals("1944-12-31", normaliseDate("1944", true)); - } - - @Test - public void endDateYearMonth() { - assertEquals("1944-01-31", normaliseDate("1944-01", true)); - } } \ No newline at end of file diff --git a/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java new file mode 100644 index 000000000..5c40f4640 --- /dev/null +++ b/ehri-io/src/test/java/eu/ehri/project/importers/util/DateRangeParserTest.java @@ -0,0 +1,40 @@ +package eu.ehri.project.importers.util; + +import com.google.common.collect.ImmutableList; +import org.apache.commons.compress.utils.Lists; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.util.Calendar; +import java.util.List; + +import static org.junit.Assert.*; + +public class DateRangeParserTest { + + private DateRangeParser rangeParser; + + @Before + public void setUp() throws Exception { + rangeParser = new DateRangeParser(); + } + + @After + public void tearDown() throws Exception { + } + + @Test + public void parse() { + List list = ImmutableList.of( + "1939 ca.", + "1939 - 1942", + "01/1942 - 06/1942", + "1935-03/1935-05" + ); + + for (String date : list) { + System.out.println("Date: " + date + " -> " + rangeParser.parse(date)); + } + } +} \ No newline at end of file