Skip to content

Commit

Permalink
Use a library for sloppy date parsing
Browse files Browse the repository at this point in the history
Remove support for dangerous Bundesarchiv date range format (1906/08,
meaning the years) and add support for e.g. `1933 ca.` and
`1939/09 - 1945/05`. Refactor some confusing parsing code (and introduce
some more of our own for ranges).
  • Loading branch information
mikesname committed Jun 12, 2023
1 parent 9bb7d75 commit 919bec9
Show file tree
Hide file tree
Showing 18 changed files with 294 additions and 177 deletions.
7 changes: 7 additions & 0 deletions ehri-io/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@
<version>1.4.1</version>
</dependency>

<!-- Date parsing -->
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
<version>1.0.11</version>
</dependency>

<!-- Jackson CSV -->
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import eu.ehri.project.importers.ImportCallback;
import eu.ehri.project.importers.ImportLog;
import eu.ehri.project.importers.ImportOptions;
import eu.ehri.project.importers.links.LinkResolver;
import eu.ehri.project.importers.util.DateParser;
import eu.ehri.project.models.base.Accessible;
import eu.ehri.project.models.base.Accessor;
import eu.ehri.project.models.base.Actioner;
import eu.ehri.project.models.base.PermissionScope;
import eu.ehri.project.persistence.BundleManager;
Expand All @@ -44,6 +47,8 @@ public abstract class AbstractImporter<I, T extends Accessible> implements ItemI
protected final GraphManager manager;
protected final ImportOptions options;
protected final ImportLog log;
protected final DateParser dateParser;
protected final LinkResolver linkResolver;
private final List<ImportCallback> callbacks = Lists.newArrayList();
private final List<ErrorCallback> errorCallbacks = Lists.newArrayList();

Expand Down Expand Up @@ -91,6 +96,8 @@ public AbstractImporter(FramedGraph<?> graph, PermissionScope scope, Actioner ac
this.log = log;
this.options = options;
manager = GraphManagerFactory.getInstance(graph);
linkResolver = new LinkResolver(graph, actioner.as(Accessor.class));
dateParser = new DateParser();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,10 @@
import eu.ehri.project.importers.ImportLog;
import eu.ehri.project.importers.ImportOptions;
import eu.ehri.project.importers.base.AbstractImporter;
import eu.ehri.project.importers.links.LinkResolver;
import eu.ehri.project.importers.util.ImportHelpers;
import eu.ehri.project.models.AccessPointType;
import eu.ehri.project.models.EntityClass;
import eu.ehri.project.models.HistoricalAgent;
import eu.ehri.project.models.base.Accessor;
import eu.ehri.project.models.base.Actioner;
import eu.ehri.project.models.base.Description;
import eu.ehri.project.models.base.PermissionScope;
Expand All @@ -57,8 +55,6 @@ public class EacImporter extends AbstractImporter<Map<String, Object>, Historica
private static final String REL_TYPE = "type";
private static final String REL_NAME = "name";

private final LinkResolver linkResolver;

/**
* Construct an EacImporter object.
*
Expand All @@ -70,7 +66,6 @@ public class EacImporter extends AbstractImporter<Map<String, Object>, Historica
*/
public EacImporter(FramedGraph<?> graph, PermissionScope permissionScope, Actioner actioner, ImportOptions options, ImportLog log) {
super(graph, permissionScope, actioner, options, log);
linkResolver = new LinkResolver(graph, actioner.as(Accessor.class));
}

@Override
Expand All @@ -93,7 +88,7 @@ public HistoricalAgent importItem(Map<String, Object> itemData) throws Validatio

// Add dates and descriptions to the bundle since they're @Dependent
// relations.
for (Map<String, Object> dpb : ImportHelpers.extractDates(itemData)) {
for (Map<String, Object> dpb : dateParser.extractDates(itemData)) {
descBundle = descBundle.withRelation(Ontology.ENTITY_HAS_DATE, Bundle.of(EntityClass.DATE_PERIOD, dpb));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,15 @@
import eu.ehri.project.importers.ImportLog;
import eu.ehri.project.importers.ImportOptions;
import eu.ehri.project.importers.base.AbstractImporter;
import eu.ehri.project.importers.links.LinkResolver;
import eu.ehri.project.importers.util.ImportHelpers;
import eu.ehri.project.models.AccessPointType;
import eu.ehri.project.models.DocumentaryUnit;
import eu.ehri.project.models.EntityClass;
import eu.ehri.project.models.Repository;
import eu.ehri.project.models.base.AbstractUnit;
import eu.ehri.project.models.base.Accessor;
import eu.ehri.project.models.base.Actioner;
import eu.ehri.project.models.base.PermissionScope;
import eu.ehri.project.persistence.Bundle;
import eu.ehri.project.persistence.BundleManager;
import eu.ehri.project.persistence.Messages;
import eu.ehri.project.persistence.Mutation;
import eu.ehri.project.persistence.Serializer;
import eu.ehri.project.persistence.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -67,7 +61,6 @@ public class EadImporter extends AbstractImporter<Map<String, Object>, AbstractU
//the EadImporter can import ead as DocumentaryUnits, the default, or overwrite those and create VirtualUnits instead.
private final EntityClass unitEntity = EntityClass.DOCUMENTARY_UNIT;
private final Serializer mergeSerializer;
private final LinkResolver linkResolver;

public static final String ACCESS_POINT = "AccessPoint";

Expand All @@ -83,8 +76,6 @@ public class EadImporter extends AbstractImporter<Map<String, Object>, AbstractU
public EadImporter(FramedGraph<?> graph, PermissionScope permissionScope, Actioner actioner, ImportOptions options, ImportLog log) {
super(graph, permissionScope, actioner, options, log);
mergeSerializer = new Serializer.Builder(graph).dependentOnly().build();
linkResolver = new LinkResolver(graph, actioner.as(Accessor.class));

}

/**
Expand Down Expand Up @@ -150,7 +141,7 @@ public AbstractUnit importItem(Map<String, Object> itemData, List<String> idPath
* @throws ValidationError when data constraints are not met
*/
protected Bundle getDescription(Map<String, Object> itemData) throws ValidationError {
List<Map<String, Object>> extractedDates = ImportHelpers.extractDates(itemData);
List<Map<String, Object>> extractedDates = dateParser.extractDates(itemData);

Map<String, Object> raw = ImportHelpers.extractDescription(itemData, EntityClass.DOCUMENTARY_UNIT_DESCRIPTION);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ public Repository importItem(Map<String, Object> itemData) throws ValidationErro

// Add dates and descriptions to the bundle since they're @Dependent
// relations.
for (Map<String, Object> dpb : ImportHelpers.extractDates(itemData)) {
for (Map<String, Object> dpb : dateParser.extractDates(itemData)) {
descBundle = descBundle.withRelation(Ontology.ENTITY_HAS_DATE, Bundle.of(EntityClass.DATE_PERIOD, dpb));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ public class SaxImportManager extends AbstractImportManager {

private static final Logger logger = LoggerFactory.getLogger(SaxImportManager.class);

private static final Config config = ConfigFactory.load();
private final Class<? extends SaxXmlHandler> handlerClass;
private final ImportOptions options;
private final List<ImportCallback> extraCallbacks;
Expand Down
108 changes: 10 additions & 98 deletions ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,17 @@
import eu.ehri.project.definitions.Entities;
import eu.ehri.project.definitions.Ontology;
import eu.ehri.project.importers.properties.XmlImportProperties;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static eu.ehri.project.importers.util.ImportHelpers.getSubNode;

/**
* This class contains static functions to extract date information from
* largely unstructured maps.
* Class for extracting date info from unstructured or semi-structured data and text.
*
* There are two main scenarios:
*
Expand All @@ -37,43 +28,15 @@
* Notably, the function that returns the dates removes the data from
* which they were extracted.
*/
class DateParser {
public class DateParser {

private static final Logger logger = LoggerFactory.getLogger(DateParser.class);

// Various date patterns
private static final Pattern[] datePatterns = {
// Yad Vashem, ICA-Atom style: 1924-1-1 - 1947-12-31
// Yad Vashem in Wp2: 12-15-1941, 9-30-1944
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})\\s?-\\s?(\\d{4}-\\d{1,2}-\\d{1,2})$"),
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})$"),
Pattern.compile("^(\\d{4})\\s?-\\s?(\\d{4})$"),
Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4}s)-\\[(\\d{4}s)\\]$"),
Pattern.compile("^\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4})$"),
Pattern.compile("^(\\d{2})th century$"),
Pattern.compile("^\\s*(\\d{4})\\s*-\\s*(\\d{4})"),
//bundesarchive: 1906/19
Pattern.compile("^\\s*(\\d{4})/(\\d{2})"),
Pattern.compile("^\\s*(\\d{4})\\s*/\\s*(\\d{4})"),
Pattern.compile("^(\\d{4}-\\d{1,2})/(\\d{4}-\\d{1,2})"),
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})/(\\d{4}-\\d{1,2}-\\d{1,2})"),
Pattern.compile("^(\\d{4})/(\\d{4}-\\d{1,2}-\\d{1,2})")
};

// NB: Using English locale here to avoid ambiguities caused by system dependent
// time zones such as: Cannot parse "1940-05-16": Illegal instant due to time zone
// offset transition (Europe/Amsterdam)
// https://en.wikipedia.org/wiki/UTC%2B00:20
private static final DateTimeFormatter isoDateTimeFormat = ISODateTimeFormat.date().withLocale(Locale.ENGLISH);

// NB: Not static yet since these objects aren't thread safe :(
private static final SimpleDateFormat yearMonthDateFormat = new SimpleDateFormat("yyyy-MM");
private static final SimpleDateFormat yearDateFormat = new SimpleDateFormat("yyyy");
private static final XmlImportProperties dates = new XmlImportProperties("dates.properties");
private final DateRangeParser rangeParser;

public DateParser() {
rangeParser = new DateRangeParser();
}

/**
* Extract a set of dates from input data. The input data is mutated to
Expand All @@ -82,7 +45,7 @@ class DateParser {
* @param data a map of input data
* @return a list of parsed date period maps
*/
static List<Map<String, Object>> extractDates(Map<String, Object> data) {
public List<Map<String, Object>> extractDates(Map<String, Object> data) {
List<Map<String, Object>> extractedDates = Lists.newArrayList();

if (data.containsKey(Entities.DATE_PERIOD)) {
Expand All @@ -108,7 +71,7 @@ static List<Map<String, Object>> extractDates(Map<String, Object> data) {
return extractedDates;
}

private static void replaceDates(Map<String, Object> data, List<Map<String, Object>> extractedDates, Map<String, String> dateValues) {
private void replaceDates(Map<String, Object> data, List<Map<String, Object>> extractedDates, Map<String, String> dateValues) {
Map<String, String> dateTypes = Maps.newHashMap();
for (String dateValue : dateValues.keySet()) {
dateTypes.put(dateValues.get(dateValue), null);
Expand All @@ -134,23 +97,8 @@ private static void replaceDates(Map<String, Object> data, List<Map<String, Obje
}
}

private static Optional<Map<String, Object>> extractDate(String date) {
Map<String, Object> data = matchDate(date);
return data.isEmpty() ? Optional.empty() : Optional.of(data);
}

private static Map<String, Object> matchDate(String date) {
Map<String, Object> data = Maps.newHashMap();
for (Pattern re : datePatterns) {
Matcher matcher = re.matcher(date);
if (matcher.matches()) {
data.put(Ontology.DATE_PERIOD_START_DATE, normaliseDate(matcher.group(1)));
data.put(Ontology.DATE_PERIOD_END_DATE, normaliseDate(matcher.group(matcher.groupCount() > 1 ? 2 : 1), true));
data.put(Ontology.DATE_HAS_DESCRIPTION, date);
break;
}
}
return data;
private Optional<Map<String, Object>> extractDate(String date) {
return rangeParser.parse(date).map(DateRange::data);
}

private static Map<String, String> returnDatesAsString(Map<String, Object> data) {
Expand All @@ -172,40 +120,4 @@ private static Map<String, String> returnDatesAsString(Map<String, Object> data)
}
return datesAsString;
}

static String normaliseDate(String date) {
return normaliseDate(date, false);
}

/**
* Normalise a date in a string.
*
* @param date a String date that needs formatting
* @param endOfPeriod a string signifying whether this date is the begin of
* a period or the end of a period
* @return a String containing the formatted date.
*/
static String normaliseDate(String date, boolean endOfPeriod) {
String returnDate = isoDateTimeFormat.print(DateTime.parse(date));
if (returnDate.startsWith("00")) {
returnDate = "19" + returnDate.substring(2);
date = "19" + date;
}
if (endOfPeriod) {
if (!date.equals(returnDate)) {
ParsePosition p = new ParsePosition(0);
yearMonthDateFormat.parse(date, p);
if (p.getIndex() > 0) {
returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusMonths(1).minusDays(1));
} else {
p = new ParsePosition(0);
yearDateFormat.parse(date, p);
if (p.getIndex() > 0) {
returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusYears(1).minusDays(1));
}
}
}
}
return returnDate;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package eu.ehri.project.importers.util;

import eu.ehri.project.definitions.Ontology;
import org.apache.jena.ext.com.google.common.collect.Maps;

import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.Map;
import java.util.Objects;

/**
 * An immutable date range with a mandatory start date, an optional end
 * date and an optional free-text description.
 *
 * A range without an end date represents a single date.
 */
public class DateRange {

    private static final DateTimeFormatter isoDateFormat = DateTimeFormatter.ISO_LOCAL_DATE;

    private final LocalDate start;
    private final LocalDate end;
    private final String description;

    /**
     * Create a date range.
     *
     * @param start       the start date, which must not be null
     * @param end         the end date, or null if there is none
     * @param description a textual description of the range, or null
     * @throws IllegalArgumentException if {@code start} is null
     */
    public DateRange(LocalDate start, LocalDate end, String description) {
        if (start == null) {
            throw new IllegalArgumentException("DateRange start must not be null");
        }
        this.start = start;
        this.end = end;
        this.description = description;
    }

    /**
     * Static factory equivalent to
     * {@link #DateRange(LocalDate, LocalDate, String)}.
     *
     * @param start       the start date, which must not be null
     * @param end         the end date, or null if there is none
     * @param description a textual description of the range, or null
     * @return a new date range
     * @throws IllegalArgumentException if {@code start} is null
     */
    public static DateRange of(LocalDate start, LocalDate end, String description) {
        return new DateRange(start, end, description);
    }

    /**
     * Create a date range with no end date.
     *
     * @param start       the start date, which must not be null
     * @param description a textual description of the range, or null
     * @throws IllegalArgumentException if {@code start} is null
     */
    public DateRange(LocalDate start, String description) {
        this(start, null, description);
    }

    /**
     * Render the range as {@code "YYYY-MM-DD - YYYY-MM-DD"}, or just
     * {@code "YYYY-MM-DD"} if there is no end date. The description is
     * not included.
     */
    @Override
    public String toString() {
        return end != null
                ? String.format("%s - %s", toLocalDateString(start), toLocalDateString(end))
                : toLocalDateString(start);
    }

    /**
     * Debug factory: creates a DateRange from a "YYYY-MM-DD - YYYY-MM-DD"
     * string, or from a single "YYYY-MM-DD" date. This is the inverse of
     * {@link #toString()}.
     *
     * @param s           the string to parse
     * @param description a textual description of the range, or null
     * @return a new date range
     * @throws java.time.format.DateTimeParseException if a date component
     *                                                 cannot be parsed
     */
    public static DateRange fromString(String s, String description) {
        // The separator is a hyphen surrounded by whitespace, so the hyphens
        // inside each ISO date are left intact.
        final String[] split = s.split("\\s-\\s");
        final LocalDate d1 = LocalDate.parse(split[0]);
        final LocalDate d2 = split.length > 1 ? LocalDate.parse(split[1]) : null;
        return new DateRange(d1, d2, description);
    }

    /**
     * Convert this range to a property map. The start date is always
     * present; the end date and description are only included when
     * they are not null. Dates are formatted as ISO local dates.
     *
     * @return a new mutable map of the range's properties
     */
    public Map<String, Object> data() {
        Map<String, Object> data = Maps.newHashMap();
        data.put(Ontology.DATE_PERIOD_START_DATE, toLocalDateString(start));
        if (end != null) {
            data.put(Ontology.DATE_PERIOD_END_DATE, toLocalDateString(end));
        }
        if (description != null) {
            data.put(Ontology.DATE_HAS_DESCRIPTION, description);
        }
        return data;
    }

    private static String toLocalDateString(LocalDate instant) {
        return instant.format(isoDateFormat);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        DateRange dateRange = (DateRange) o;
        // Both end and description may be null, so compare them null-safely
        // to stay consistent with hashCode().
        return start.equals(dateRange.start)
                && Objects.equals(end, dateRange.end)
                && Objects.equals(description, dateRange.description);
    }

    @Override
    public int hashCode() {
        return Objects.hash(start, end, description);
    }
}
Loading

0 comments on commit 919bec9

Please sign in to comment.