-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
WIP: use a library for sloppy date parsing.
Remove support for dangerous Bundesarchiv date range format (1906/08, meaning the years) and add support for e.g. `1933 ca.` and `1939/09 - 1945/05`. More refactoring and tests are needed here.
- Loading branch information
Showing
14 changed files
with
232 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
59 changes: 59 additions & 0 deletions
59
ehri-io/src/main/java/eu/ehri/project/importers/util/DateRange.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
package eu.ehri.project.importers.util; | ||
|
||
import eu.ehri.project.definitions.Ontology; | ||
import org.apache.jena.ext.com.google.common.collect.Maps; | ||
|
||
import java.time.Instant; | ||
import java.time.ZoneId; | ||
import java.time.format.DateTimeFormatter; | ||
import java.util.Map; | ||
|
||
public class DateRange { | ||
|
||
private static final DateTimeFormatter isoDateFormat = DateTimeFormatter.ISO_LOCAL_DATE; | ||
private static final DateTimeFormatter isoDateTimeFormat = DateTimeFormatter.ISO_LOCAL_DATE_TIME; | ||
|
||
private final Instant start; | ||
private final Instant end; | ||
private final String description; | ||
|
||
public DateRange(Instant start, Instant end, String description) { | ||
if (start == null) { | ||
throw new IllegalArgumentException("DateRange start must not be null"); | ||
} | ||
this.start = start; | ||
this.end = end; | ||
this.description = description; | ||
} | ||
|
||
public DateRange(Instant start, String description) { | ||
this(start, null, description); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return end != null | ||
? String.format("%s - %s", toLocalDateString(start), toLocalDateString(end)) | ||
: toLocalDateString(start); | ||
} | ||
|
||
public Map<String, Object> data() { | ||
Map<String, Object> data = Maps.newHashMap(); | ||
data.put(Ontology.DATE_PERIOD_START_DATE, toLocalDateString(start)); | ||
if (end != null) { | ||
data.put(Ontology.DATE_PERIOD_END_DATE, toLocalDateString(end)); | ||
} | ||
if (description != null) { | ||
data.put(Ontology.DATE_HAS_DESCRIPTION, description); | ||
} | ||
return data; | ||
} | ||
|
||
private String toLocalDateString(Instant instant) { | ||
return instant.atZone(ZoneId.systemDefault()).format(isoDateFormat); | ||
} | ||
|
||
private String toLocalDateTimeString(Instant instant) { | ||
return instant.atZone(ZoneId.systemDefault()).format(isoDateTimeFormat); | ||
} | ||
} |
90 changes: 90 additions & 0 deletions
90
ehri-io/src/main/java/eu/ehri/project/importers/util/DateRangeParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package eu.ehri.project.importers.util; | ||
|
||
import com.github.sisyphsu.dateparser.DateParser; | ||
|
||
import java.time.Instant; | ||
import java.time.format.DateTimeParseException; | ||
import java.util.Calendar; | ||
import java.util.Date; | ||
import java.util.Optional; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
public class DateRangeParser { | ||
|
||
private static final Pattern yearRange = Pattern.compile("^(?<start>\\d{4})\\s?[\\-/]\\s?(?<end>\\d{4})$"); | ||
|
||
private final DateParser parser; | ||
|
||
public DateRangeParser() { | ||
parser = DateParser.newBuilder() | ||
.addRule("(?<year>\\d{4}) ca\\.?") | ||
.addRule("summer (?<year>\\d{4})") | ||
.addRule("(?<year>\\d{4})[/\\-](?<month>\\d)[/\\-](?<day>\\d)") | ||
.addRule("(?<month>\\d{2})/(?<year>\\d{4})") | ||
.addRule("(?<year>\\d{4})-(?<month>\\d{2})") | ||
.build(); | ||
} | ||
|
||
public static Optional<DateRange> parseDate(String str) { | ||
DateRangeParser parser = new DateRangeParser(); | ||
return parser.parse(str); | ||
} | ||
|
||
private Optional<DateRange> parseRange(String from, String to, String orig) { | ||
|
||
final Calendar d1 = parser.parseCalendar(from); | ||
final Calendar d2 = parser.parseCalendar(to); | ||
|
||
// If we don't have a specific day or month, set these to the appropriate maximum... | ||
if (to.length() == 4) { | ||
d2.set(Calendar.MONTH, d2.getActualMaximum(Calendar.MONTH)); | ||
d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); | ||
} else if (to.replaceAll("\\D", "").length() < 8) { | ||
// FIXME: this heuristic could fail on date with 2-digit year, | ||
// like 201240, which we don't support anyway... | ||
d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); | ||
} | ||
|
||
return Optional.of(new DateRange(d1.toInstant(), d2.toInstant(), orig)); | ||
} | ||
|
||
private Optional<DateRange> parseSingle(String date, String orig) { | ||
final Calendar d = parser.parseCalendar(date); | ||
Instant end = null; | ||
if (date.length() == 4) { | ||
Calendar d2 = (Calendar) d.clone(); | ||
d2.set(Calendar.MONTH, d2.getActualMaximum(Calendar.MONTH)); | ||
d2.set(Calendar.DAY_OF_MONTH, d2.getActualMaximum(Calendar.DAY_OF_MONTH)); | ||
end = d2.toInstant(); | ||
} | ||
return Optional.of(new DateRange(d.toInstant(), end, orig)); | ||
} | ||
|
||
public Optional<DateRange> parse(String str) { | ||
try { | ||
// See if the string matches a year range... | ||
final Matcher matcher = yearRange.matcher(str); | ||
if (matcher.matches()) { | ||
return parseRange(matcher.group("start"), matcher.group("end"), str); | ||
|
||
} else if (str.contains(" - ")) { | ||
final String[] parts = str.split("\\s-\\s"); | ||
return parseRange(parts[0], parts[1], str); | ||
} else if (str.length() > 12 && (str.charAt(str.length() / 2) == '-' || str.charAt(str.length() / 2) == '/')) { | ||
// Heuristics: if a string is longer than 13 chars and the | ||
// middle char is a '-', assume it's a date range... | ||
return parseRange( | ||
str.subSequence(0, str.length() / 2).toString().trim(), | ||
str.subSequence((str.length() / 2) + 1, str.length()).toString().trim(), str); | ||
} else { | ||
// Otherwise, attempt to parse as a single date, or fail... | ||
return parseSingle(str, str); | ||
} | ||
} catch (IllegalArgumentException | DateTimeParseException e) { | ||
// TODO ? anything else? | ||
e.printStackTrace(); | ||
return Optional.empty(); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.