Skip to content

Commit

Permalink
WIP: use a library for sloppy date parsing.
Browse files Browse the repository at this point in the history
Remove support for the dangerous Bundesarchiv date range format (e.g.
`1906/08`, where both numbers are years, i.e. 1906–1908) and add support
for formats such as `1933 ca.` and `1939/09 - 1945/05`. More refactoring
and tests are needed here.
  • Loading branch information
mikesname committed Mar 3, 2023
1 parent 5f99ec2 commit 2d48bd2
Show file tree
Hide file tree
Showing 14 changed files with 232 additions and 144 deletions.
7 changes: 7 additions & 0 deletions ehri-io/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@
<version>1.4.1</version>
</dependency>

<!-- Date parsing -->
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
<version>1.0.11</version>
</dependency>

<!-- Jackson CSV -->
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import eu.ehri.project.importers.ImportOptions;
import eu.ehri.project.importers.base.AbstractImporter;
import eu.ehri.project.importers.links.LinkResolver;
import eu.ehri.project.importers.util.DateParser;
import eu.ehri.project.importers.util.ImportHelpers;
import eu.ehri.project.models.AccessPointType;
import eu.ehri.project.models.DocumentaryUnit;
Expand Down Expand Up @@ -68,6 +69,7 @@ public class EadImporter extends AbstractImporter<Map<String, Object>, AbstractU
private final EntityClass unitEntity = EntityClass.DOCUMENTARY_UNIT;
private final Serializer mergeSerializer;
private final LinkResolver linkResolver;
private final DateParser dateParser;

public static final String ACCESS_POINT = "AccessPoint";

Expand All @@ -84,6 +86,7 @@ public EadImporter(FramedGraph<?> graph, PermissionScope permissionScope, Action
super(graph, permissionScope, actioner, options, log);
mergeSerializer = new Serializer.Builder(graph).dependentOnly().build();
linkResolver = new LinkResolver(graph, actioner.as(Accessor.class));
dateParser = new DateParser();

}

Expand Down Expand Up @@ -150,7 +153,7 @@ public AbstractUnit importItem(Map<String, Object> itemData, List<String> idPath
* @throws ValidationError when data constraints are not met
*/
protected Bundle getDescription(Map<String, Object> itemData) throws ValidationError {
List<Map<String, Object>> extractedDates = ImportHelpers.extractDates(itemData);
List<Map<String, Object>> extractedDates = dateParser.extractDates(itemData);

Map<String, Object> raw = ImportHelpers.extractDescription(itemData, EntityClass.DOCUMENTARY_UNIT_DESCRIPTION);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ public class SaxImportManager extends AbstractImportManager {

private static final Logger logger = LoggerFactory.getLogger(SaxImportManager.class);

private static final Config config = ConfigFactory.load();
private final Class<? extends SaxXmlHandler> handlerClass;
private final ImportOptions options;
private final List<ImportCallback> extraCallbacks;
Expand Down
108 changes: 10 additions & 98 deletions ehri-io/src/main/java/eu/ehri/project/importers/util/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,17 @@
import eu.ehri.project.definitions.Entities;
import eu.ehri.project.definitions.Ontology;
import eu.ehri.project.importers.properties.XmlImportProperties;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static eu.ehri.project.importers.util.ImportHelpers.getSubNode;

/**
* This class contains static functions to extract date information from
* largely unstructured maps.
* Class for extracting date info from unstructured or semi-structured data and text.
*
* There are two main scenarios:
*
Expand All @@ -37,43 +28,15 @@
* Notable, the function that returns the dates removes the data from
* which they were extracted
*/
class DateParser {
public class DateParser {

private static final Logger logger = LoggerFactory.getLogger(DateParser.class);

// Various date patterns
private static final Pattern[] datePatterns = {
// Yad Vashem, ICA-Atom style: 1924-1-1 - 1947-12-31
// Yad Vashem in Wp2: 12-15-1941, 9-30-1944
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})\\s?-\\s?(\\d{4}-\\d{1,2}-\\d{1,2})$"),
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})$"),
Pattern.compile("^(\\d{4})\\s?-\\s?(\\d{4})$"),
Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4}s)-\\[(\\d{4}s)\\]$"),
Pattern.compile("^\\[(\\d{4})\\]$"),
Pattern.compile("^(\\d{4})$"),
Pattern.compile("^(\\d{2})th century$"),
Pattern.compile("^\\s*(\\d{4})\\s*-\\s*(\\d{4})"),
//bundesarchive: 1906/19
Pattern.compile("^\\s*(\\d{4})/(\\d{2})"),
Pattern.compile("^\\s*(\\d{4})\\s*/\\s*(\\d{4})"),
Pattern.compile("^(\\d{4}-\\d{1,2})/(\\d{4}-\\d{1,2})"),
Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})/(\\d{4}-\\d{1,2}-\\d{1,2})"),
Pattern.compile("^(\\d{4})/(\\d{4}-\\d{1,2}-\\d{1,2})")
};

// NB: Using English locale here to avoid ambiguities caused by system dependent
// time zones such as: Cannot parse "1940-05-16": Illegal instant due to time zone
// offset transition (Europe/Amsterdam)
// https://en.wikipedia.org/wiki/UTC%2B00:20
private static final DateTimeFormatter isoDateTimeFormat = ISODateTimeFormat.date().withLocale(Locale.ENGLISH);

// NB: Not static yet since these objects aren't thread safe :(
private static final SimpleDateFormat yearMonthDateFormat = new SimpleDateFormat("yyyy-MM");
private static final SimpleDateFormat yearDateFormat = new SimpleDateFormat("yyyy");
private static final XmlImportProperties dates = new XmlImportProperties("dates.properties");
private final DateRangeParser rangeParser;

public DateParser() {
rangeParser = new DateRangeParser();
}

/**
* Extract a set of dates from input data. The input data is mutated to
Expand All @@ -82,7 +45,7 @@ class DateParser {
* @param data a map of input data
* @return a list of parsed date period maps
*/
static List<Map<String, Object>> extractDates(Map<String, Object> data) {
public List<Map<String, Object>> extractDates(Map<String, Object> data) {
List<Map<String, Object>> extractedDates = Lists.newArrayList();

if (data.containsKey(Entities.DATE_PERIOD)) {
Expand All @@ -108,7 +71,7 @@ static List<Map<String, Object>> extractDates(Map<String, Object> data) {
return extractedDates;
}

private static void replaceDates(Map<String, Object> data, List<Map<String, Object>> extractedDates, Map<String, String> dateValues) {
private void replaceDates(Map<String, Object> data, List<Map<String, Object>> extractedDates, Map<String, String> dateValues) {
Map<String, String> dateTypes = Maps.newHashMap();
for (String dateValue : dateValues.keySet()) {
dateTypes.put(dateValues.get(dateValue), null);
Expand All @@ -134,23 +97,8 @@ private static void replaceDates(Map<String, Object> data, List<Map<String, Obje
}
}

private static Optional<Map<String, Object>> extractDate(String date) {
Map<String, Object> data = matchDate(date);
return data.isEmpty() ? Optional.empty() : Optional.of(data);
}

private static Map<String, Object> matchDate(String date) {
Map<String, Object> data = Maps.newHashMap();
for (Pattern re : datePatterns) {
Matcher matcher = re.matcher(date);
if (matcher.matches()) {
data.put(Ontology.DATE_PERIOD_START_DATE, normaliseDate(matcher.group(1)));
data.put(Ontology.DATE_PERIOD_END_DATE, normaliseDate(matcher.group(matcher.groupCount() > 1 ? 2 : 1), true));
data.put(Ontology.DATE_HAS_DESCRIPTION, date);
break;
}
}
return data;
private Optional<Map<String, Object>> extractDate(String date) {
return rangeParser.parse(date).map(DateRange::data);
}

private static Map<String, String> returnDatesAsString(Map<String, Object> data) {
Expand All @@ -172,40 +120,4 @@ private static Map<String, String> returnDatesAsString(Map<String, Object> data)
}
return datesAsString;
}

static String normaliseDate(String date) {
return normaliseDate(date, false);
}

/**
* Normalise a date in a string.
*
* @param date a String date that needs formatting
* @param endOfPeriod a string signifying whether this date is the begin of
* a period or the end of a period
* @return a String containing the formatted date.
*/
static String normaliseDate(String date, boolean endOfPeriod) {
String returnDate = isoDateTimeFormat.print(DateTime.parse(date));
if (returnDate.startsWith("00")) {
returnDate = "19" + returnDate.substring(2);
date = "19" + date;
}
if (endOfPeriod) {
if (!date.equals(returnDate)) {
ParsePosition p = new ParsePosition(0);
yearMonthDateFormat.parse(date, p);
if (p.getIndex() > 0) {
returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusMonths(1).minusDays(1));
} else {
p = new ParsePosition(0);
yearDateFormat.parse(date, p);
if (p.getIndex() > 0) {
returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusYears(1).minusDays(1));
}
}
}
}
return returnDate;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package eu.ehri.project.importers.util;

import eu.ehri.project.definitions.Ontology;
import org.apache.jena.ext.com.google.common.collect.Maps;

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Map;

/**
 * A span of time with a mandatory start instant, an optional end instant
 * and an optional free-text description.
 * <p>
 * Instants are rendered as ISO local dates in the JVM's default time zone
 * when the range is converted to a string or to a property map.
 */
public class DateRange {

    private static final DateTimeFormatter isoDateFormat = DateTimeFormatter.ISO_LOCAL_DATE;
    private static final DateTimeFormatter isoDateTimeFormat = DateTimeFormatter.ISO_LOCAL_DATE_TIME;

    private final Instant start;
    private final Instant end;
    private final String description;

    /**
     * Create a range with an explicit end.
     *
     * @param start       the start of the range, must not be null
     * @param end         the end of the range, or null if there is none
     * @param description the text this range was derived from, or null
     * @throws IllegalArgumentException if {@code start} is null
     */
    public DateRange(Instant start, Instant end, String description) {
        if (null == start) {
            throw new IllegalArgumentException("DateRange start must not be null");
        }
        this.description = description;
        this.end = end;
        this.start = start;
    }

    /**
     * Create a range that only has a start.
     *
     * @param start       the start of the range, must not be null
     * @param description the text this range was derived from, or null
     */
    public DateRange(Instant start, String description) {
        this(start, null, description);
    }

    /**
     * Render the range as {@code start - end}, or just {@code start}
     * when there is no end.
     */
    @Override
    public String toString() {
        final String from = toLocalDateString(start);
        if (end == null) {
            return from;
        }
        return from + " - " + toLocalDateString(end);
    }

    /**
     * Convert the range to a property map.
     *
     * @return a new mutable map holding the start date and, only when they
     * are non-null, the end date and the description
     */
    public Map<String, Object> data() {
        final Map<String, Object> fields = Maps.newHashMap();
        fields.put(Ontology.DATE_PERIOD_START_DATE, toLocalDateString(start));
        if (null != end) {
            fields.put(Ontology.DATE_PERIOD_END_DATE, toLocalDateString(end));
        }
        if (null != description) {
            fields.put(Ontology.DATE_HAS_DESCRIPTION, description);
        }
        return fields;
    }

    // Format an instant as an ISO local date in the system default zone.
    private String toLocalDateString(Instant instant) {
        return render(instant, isoDateFormat);
    }

    // Format an instant as an ISO local date-time in the system default zone.
    private String toLocalDateTimeString(Instant instant) {
        return render(instant, isoDateTimeFormat);
    }

    // Shared rendering step: zone the instant, then apply the formatter.
    private static String render(Instant instant, DateTimeFormatter formatter) {
        final ZoneId zone = ZoneId.systemDefault();
        return instant.atZone(zone).format(formatter);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package eu.ehri.project.importers.util;

import com.github.sisyphsu.dateparser.DateParser;

import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.Calendar;
import java.util.Date;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses loosely formatted date text into a {@link DateRange}.
 * <p>
 * Input is tried, in order, as a bare year range ({@code 1939-1945}), as
 * two dates separated by {@code " - "}, as two dates joined by a
 * {@code -} or {@code /} at the exact middle of the string, and finally as
 * a single date. Anything that cannot be parsed yields an empty result.
 */
public class DateRangeParser {

    // A bare year range such as "1939-1945" or "1939 / 1945".
    private static final Pattern yearRange = Pattern.compile("^(?<start>\\d{4})\\s?[\\-/]\\s?(?<end>\\d{4})$");

    // Separator between the two halves of an explicit range, e.g. "1939/09 - 1945/05".
    private static final Pattern rangeSeparator = Pattern.compile("\\s-\\s");

    // A single run of digits, used to count the numeric components of a date.
    private static final Pattern digitRun = Pattern.compile("\\d+");

    private final DateParser parser;

    public DateRangeParser() {
        parser = DateParser.newBuilder()
                .addRule("(?<year>\\d{4}) ca\\.?")
                .addRule("summer (?<year>\\d{4})")
                .addRule("(?<year>\\d{4})[/\\-](?<month>\\d)[/\\-](?<day>\\d)")
                .addRule("(?<month>\\d{2})/(?<year>\\d{4})")
                .addRule("(?<year>\\d{4})-(?<month>\\d{2})")
                .build();
    }

    /**
     * Convenience entry point that parses with a freshly built parser.
     *
     * @param str the text to parse, may be null
     * @return the parsed range, or empty if the text could not be parsed
     */
    public static Optional<DateRange> parseDate(String str) {
        return new DateRangeParser().parse(str);
    }

    /**
     * Parse the two halves of a range.
     *
     * @param from the text of the start date
     * @param to   the text of the end date
     * @param orig the full original text, kept as the range description
     * @return the parsed range, or empty if either half is empty
     */
    private Optional<DateRange> parseRange(String from, String to, String orig) {
        if (from.isEmpty() || to.isEmpty()) {
            return Optional.empty();
        }

        final Calendar d1 = parser.parseCalendar(from);
        final Calendar d2 = parser.parseCalendar(to);

        // If we don't have a specific day or month, set these to the appropriate maximum...
        if (to.length() == 4) {
            moveToEndOfYear(d2);
        } else if (!hasExplicitDay(to)) {
            d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH));
        }

        return Optional.of(new DateRange(d1.toInstant(), d2.toInstant(), orig));
    }

    /**
     * Parse a single date. A bare four-character year is expanded to a
     * range ending on the last day of that year; any other input gets no end.
     */
    private Optional<DateRange> parseSingle(String date, String orig) {
        final Calendar d = parser.parseCalendar(date);
        Instant end = null;
        if (date.length() == 4) {
            final Calendar d2 = (Calendar) d.clone();
            moveToEndOfYear(d2);
            end = d2.toInstant();
        }
        return Optional.of(new DateRange(d.toInstant(), end, orig));
    }

    // Move the calendar to the last day of the last month of its year.
    private static void moveToEndOfYear(Calendar calendar) {
        calendar.set(Calendar.MONTH, calendar.getActualMaximum(Calendar.MONTH));
        calendar.set(Calendar.DAY_OF_MONTH, calendar.getActualMaximum(Calendar.DAY_OF_MONTH));
    }

    /**
     * Heuristic: does the text name a specific day?
     * <p>
     * True when the text holds at least eight digits (e.g. 1945-05-08) or
     * at least three separate numeric components (e.g. 1947-1-5). Counting
     * digits alone mistook single-digit month/day dates for month-precision
     * ones and overwrote their day.
     * <p>
     * FIXME: dates with a textual month and a short day ("5 May 1945") are
     * still classified as having no explicit day, as are 2-digit years.
     */
    private static boolean hasExplicitDay(String text) {
        int digits = 0;
        int components = 0;
        final Matcher runs = digitRun.matcher(text);
        while (runs.find()) {
            components++;
            digits += runs.end() - runs.start();
        }
        return digits >= 8 || components >= 3;
    }

    /**
     * Parse a date or date range.
     *
     * @param str the text to parse, may be null
     * @return the parsed range, or empty if the text is null or could not
     * be parsed
     */
    public Optional<DateRange> parse(String str) {
        if (str == null) {
            return Optional.empty();
        }
        try {
            // See if the string matches a year range...
            final Matcher matcher = yearRange.matcher(str);
            if (matcher.matches()) {
                return parseRange(matcher.group("start"), matcher.group("end"), str);
            } else if (str.contains(" - ")) {
                final String[] parts = rangeSeparator.split(str);
                // Trailing empty parts are dropped by split, so "1939 - "
                // yields a single element: treat that as unparseable
                // instead of indexing past the end of the array.
                if (parts.length < 2) {
                    return Optional.empty();
                }
                return parseRange(parts[0], parts[1], str);
            } else if (str.length() > 12 && (str.charAt(str.length() / 2) == '-' || str.charAt(str.length() / 2) == '/')) {
                // Heuristics: if a string is at least 13 chars long and the
                // middle char is a '-' or '/', assume it's a date range...
                final int middle = str.length() / 2;
                return parseRange(
                        str.substring(0, middle).trim(),
                        str.substring(middle + 1).trim(), str);
            } else {
                // Otherwise, attempt to parse as a single date, or fail...
                return parseSingle(str, str);
            }
        } catch (IllegalArgumentException | DateTimeParseException e) {
            // NOTE(review): this should go through the project's logger rather
            // than stderr; left as-is because this file imports no logger.
            e.printStackTrace();
            return Optional.empty();
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ public static Map<String, Object> extractAddress(Map<String, Object> itemData) {
* @return a list of entity bundles
*/
public static List<Map<String, Object>> extractDates(Map<String, Object> data) {
    // DateParser.extractDates is an instance method, so build a parser
    // for this call and delegate to it.
    return new DateParser().extractDates(data);
}

/**
Expand Down
Loading

0 comments on commit 2d48bd2

Please sign in to comment.