Skip to content

Commit

Permalink
#24 EML 2.2.0 updates
Browse files Browse the repository at this point in the history
- Change description type from List to String, use DocBook
- Update IPT EML template, use DocBook for description, introduction, gettingStarted, acknowledgements
- Update EmlFactory and tests
  • Loading branch information
mike-podolskiy90 committed Jul 2, 2024
1 parent a39381e commit 92a5207
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,58 @@

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.beanutils.MethodUtils;
import org.apache.commons.digester3.Digester;
import org.apache.commons.digester3.NodeCreateRule;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import com.sun.org.apache.xml.internal.serialize.OutputFormat;
import com.sun.org.apache.xml.internal.serialize.XMLSerializer;

/**
* This class is considered a utility for testing but should be migrated to the source when stable, as this is an EML
* Model Factory based on the Apache Commons Digester and will be used when importing DwC-A.
*/
public class EmlFactory {

// Define pairs of DocBook tags (opening tag followed by its closing tag). MUST MATCH HTML tags!
// The entry at each index is replaced by the entry at the same index of HTML_TAGS when both arrays are
// passed to StringUtils.replaceEach, so the two arrays must keep the same length and the same order.
private static final String[] DOCBOOK_TAGS = {
"<section>", "</section>",
"<title>", "</title>",
"<para>", "</para>",
"<itemizedlist>", "</itemizedlist>",
"<listitem>", "</listitem>",
"<orderedlist>", "</orderedlist>",
"<emphasis>", "</emphasis>",
"<subscript>", "</subscript>",
"<superscript>", "</superscript>",
"<literalLayout>", "</literalLayout>"
};

// Define pairs of HTML tags (opening tag followed by its closing tag). MUST MATCH DocBook tags!
// The entry at each index is the replacement for the entry at the same index of DOCBOOK_TAGS
// (e.g. "<para>" becomes "<p>"), so the two arrays must keep the same length and the same order.
private static final String[] HTML_TAGS = {
"<div>", "</div>",
"<h1>", "</h1>",
"<p>", "</p>",
"<ul>", "</ul>",
"<li>", "</li>",
"<ol>", "</ol>",
"<b>", "</b>",
"<sub>", "</sub>",
"<sup>", "</sup>",
"<pre>", "</pre>"
};

/**
* Uses rule based parsing to read the EML XML and build the EML model.
* Note the following: - Metadata provider rules are omitted on the assumption that the provider is the same as the
Expand Down Expand Up @@ -89,9 +127,11 @@ public static Eml build(InputStream xml)

digester.addBeanPropertySetter("eml/dataset/language", "language");

// descriptions, broken into multiple paragraphs
digester.addCallMethod("eml/dataset/abstract/para", "addDescriptionPara", 1);
digester.addCallParam("eml/dataset/abstract/para", 0);
// DocBook description, gettingStarted, introduction, acknowledgements
digester.addRule("eml/dataset/abstract", new SetSerializedNodeRule("setDescription", "abstract"));
digester.addRule("eml/dataset/gettingStarted", new SetSerializedNodeRule("setGettingStarted", "gettingStarted"));
digester.addRule("eml/dataset/introduction", new SetSerializedNodeRule("setIntroduction", "introduction"));
digester.addRule("eml/dataset/acknowledgements", new SetSerializedNodeRule("setAcknowledgements", "acknowledgements"));

digester.addBeanPropertySetter("eml/dataset/additionalInfo/para", "additionalInfo");
digester.addRule("eml/dataset/intellectualRights/para", new NodeCreateRule(Node.ELEMENT_NODE));
Expand Down Expand Up @@ -468,4 +508,71 @@ private static void addJGTICuratorialIUnit(Digester digester) {
// JGTICuratorialIUnit to the list in
// EML
}

/**
 * Digester rule that converts the matched element to literal XML (DocBook) and then to HTML.
 * <p>
 * When the element ends, the node is popped from the digester stack, serialized to a string, stripped of its
 * wrapper tag, converted from DocBook markup to HTML and finally passed to a single-argument method of the
 * object that is then on top of the digester stack.
 */
public static class SetSerializedNodeRule extends NodeCreateRule {

  /** Name of the method invoked on the object on top of the stack, e.g. "setDescription". */
  private String method;

  /** Name of the element whose own opening/closing tag is removed from the serialized output. */
  private String wrapperElement;

  /**
   * Creates a rule without a target method or wrapper element.
   * NOTE(review): {@link #end(String, String)} passes the null method name straight to MethodUtils, so a rule
   * built this way presumably cannot be used as-is - confirm whether this constructor is still needed.
   *
   * @throws ParserConfigurationException if the underlying node builder cannot be created
   */
  public SetSerializedNodeRule() throws ParserConfigurationException {
    super(Node.ELEMENT_NODE);
  }

  /**
   * Creates a rule bound to a target method and a wrapper element.
   *
   * @param method         name of the method (taking one String) to invoke with the converted HTML
   * @param wrapperElement name of the matched element; its own tag is stripped from the result
   * @throws ParserConfigurationException if the underlying node builder cannot be created
   */
  public SetSerializedNodeRule(String method, String wrapperElement)
      throws ParserConfigurationException {
    // Explicit for consistency with the no-arg constructor; the node type is an element in both cases.
    super(Node.ELEMENT_NODE);
    this.method = method;
    this.wrapperElement = wrapperElement;
  }

  /**
   * Pops the node from the digester stack, converts it to HTML and hands it to the target method.
   */
  @Override
  public void end(String namespace, String name) throws Exception {
    Element nodeToSerialize = super.getDigester().pop();
    String serializedNode = serializeNode(nodeToSerialize);
    invokeMethodOnTopOfStack(method, serializedNode);
  }

  /**
   * Serializes the element to XML without an XML declaration, removes the wrapper tag and converts the
   * remaining DocBook markup to HTML.
   *
   * @param nodeToSerialize element to serialize
   * @return HTML representation of the element content
   * @throws Exception if the document builder cannot be created or serialization fails
   */
  protected String serializeNode(Element nodeToSerialize) throws Exception {
    String htmlOutput;

    try (StringWriter writer = new StringWriter()) {
      // An empty document is only used to initialise the output format.
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = factory.newDocumentBuilder();
      Document doc = builder.newDocument();
      OutputFormat format = new OutputFormat(doc);
      format.setOmitXMLDeclaration(true);
      XMLSerializer serializer = new XMLSerializer(writer, format);
      serializer.serialize(nodeToSerialize);

      String serializedDocBookXml = writer.getBuffer().toString();
      String unwrappedDocBookXml = unwrapParentTag(serializedDocBookXml);
      htmlOutput = convertDocBookToHtml(unwrappedDocBookXml);
    }

    return htmlOutput;
  }

  /**
   * Removes every literal occurrence of the wrapper element's opening and closing tag.
   * Only the attribute-less form of the opening tag is removed.
   */
  private String unwrapParentTag(String str) {
    return StringUtils.replaceEach(
        str,
        new String[] {"<" + wrapperElement + ">", "</" + wrapperElement + ">"},
        new String[] {"", ""});
  }

  /**
   * Converts DocBook markup to HTML: ulink/citetitle pairs become anchors, all other supported tags are
   * replaced one-to-one using the DOCBOOK_TAGS and HTML_TAGS tables.
   */
  private String convertDocBookToHtml(String docbookXmlString) {
    // Replace links. (?s) lets '.' match line terminators, so a citetitle whose text wraps over several
    // lines is converted as well instead of being left behind as raw DocBook.
    String docBookXmlStringWithLinksReplaces =
        docbookXmlString.replaceAll(
            "(?s)<ulink\\s+url=\"(.*?)\">\\s*<citetitle>(.*?)</citetitle>\\s*</ulink>",
            "<a href=\"$1\">$2</a>");

    // Perform replacements
    return StringUtils.replaceEach(docBookXmlStringWithLinksReplaces, DOCBOOK_TAGS, HTML_TAGS);
  }

  /**
   * Invokes the named single-argument method on the object currently on top of the digester stack.
   *
   * @param methodName name of the method to invoke
   * @param param      argument passed to the method
   * @throws Exception if the method cannot be found or invoked
   */
  protected void invokeMethodOnTopOfStack(String methodName, String param) throws Exception {
    Object objOnTopOfStack = getDigester().peek();
    MethodUtils.invokeExactMethod(objOnTopOfStack, methodName, param);
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand All @@ -56,6 +57,34 @@ public class Eml implements Serializable {

private static final Logger LOG = LoggerFactory.getLogger(Eml.class);

// Define pairs of DocBook tags (opening tag followed by its closing tag). MUST MATCH HTML tags!
// The entry at each index is the replacement for the entry at the same index of HTML_TAGS when both arrays
// are passed to StringUtils.replaceEach, so the two arrays must keep the same length and the same order.
private static final String[] DOCBOOK_TAGS = {
"<section>", "</section>",
"<title>", "</title>",
"<para>", "</para>",
"<itemizedlist>", "</itemizedlist>",
"<listitem>", "</listitem>",
"<orderedlist>", "</orderedlist>",
"<emphasis>", "</emphasis>",
"<subscript>", "</subscript>",
"<superscript>", "</superscript>",
"<literalLayout>", "</literalLayout>"
};

// Define pairs of HTML tags (opening tag followed by its closing tag). MUST MATCH DocBook tags!
// The entry at each index is replaced by the entry at the same index of DOCBOOK_TAGS
// (e.g. "<p>" becomes "<para>"), so the two arrays must keep the same length and the same order.
private static final String[] HTML_TAGS = {
"<div>", "</div>",
"<h1>", "</h1>",
"<p>", "</p>",
"<ul>", "</ul>",
"<li>", "</li>",
"<ol>", "</ol>",
"<b>", "</b>",
"<sub>", "</sub>",
"<sup>", "</sup>",
"<pre>", "</pre>"
};

private static final Pattern PACKAGED_ID_PATTERN = Pattern.compile("/v([0-9]+(\\.\\d+)?)$");
private static final char SEMICOLON = ';';
private static final char COMMA = ',';
Expand All @@ -77,7 +106,7 @@ public class Eml implements Serializable {
/**
* Description, composed of one or more paragraphs.
*/
private List<String> description = new ArrayList<>();
private String description;

/**
* This is not in the GBIF extended metadata document, but seems like a sensible placeholder that can be used to
Expand Down Expand Up @@ -244,6 +273,11 @@ public class Eml implements Serializable {
*/
private List<MaintenanceChange> maintenanceChangeHistory = new ArrayList<>();

/**
* Current change of maintenance update frequency.
*/
private MaintenanceChange currentMaintenanceChange;

/**
* The 'creator' element provides the full name of the person, organization, or position who created the resource.
* The list of creators for a resource represent the people and organizations who should be cited for the resource.
Expand Down Expand Up @@ -738,6 +772,14 @@ public void setMaintenanceChangeHistory(List<MaintenanceChange> maintenanceChang
this.maintenanceChangeHistory = maintenanceChangeHistory;
}

/**
* Returns the current change of maintenance update frequency.
*
* @return the value of the currentMaintenanceChange field; null if it was never set
*/
public MaintenanceChange getCurrentMaintenanceChange() {
return currentMaintenanceChange;
}

/**
* Sets the current change of maintenance update frequency.
*
* @param currentMaintenanceChange value stored in the currentMaintenanceChange field
*/
public void setCurrentMaintenanceChange(MaintenanceChange currentMaintenanceChange) {
this.currentMaintenanceChange = currentMaintenanceChange;
}

/**
* Returns the list of creators of the resource.
*
* @return the creators list held by this instance (the field itself, not a copy)
*/
public List<Agent> getCreators() {
return creators;
}
Expand Down Expand Up @@ -849,7 +891,7 @@ public String getCreatorName() {
return null;
}

public List<String> getDescription() {
/**
* Returns the description of the resource as a single string.
* Delegates to {@link #getAbstract()}.
*
* @return the description, as returned by getAbstract()
*/
public String getDescription() {
return getAbstract();
}

Expand Down Expand Up @@ -901,15 +943,6 @@ public void addAlternateIdentifier(String alternateIdentifier) {
alternateIdentifiers.add(alternateIdentifier);
}

/**
* Adds another paragraph to description.
*
* @param para paragraph
*/
public void addDescriptionPara(String para) {
description.add(para);
}

/**
* Utility to add an agent to the creators list. This method was introduced to ease the Digester rules for parsing of
* EML.
Expand Down Expand Up @@ -1064,7 +1097,7 @@ public void parseIntellectualRights(org.w3c.dom.Element element) {
this.intellectualRights = paraXmlToHtml(xmlStr);
}

public List<String> getAbstract() {
/**
* Returns the abstract of the resource, which is stored in the description field.
*
* @return the value of the description field; null if it was never set
*/
public String getAbstract() {
return description;
}

Expand All @@ -1076,7 +1109,7 @@ public String getPackageId() {
return guid + "/v" + emlVersion.toPlainString();
}

public void setAbstract(List<String> description) {
/**
* Sets the abstract of the resource; the value is stored in the description field.
*
* @param description new value of the description field
*/
public void setAbstract(String description) {
this.description = description;
}

Expand All @@ -1088,7 +1121,7 @@ public void setCitation(String citation, String identifier) {
this.citation = new Citation(citation, identifier);
}

public void setDescription(List<String> description) {
/**
* Sets the description of the resource. Writes the same field as setAbstract(String).
*
* @param description new value of the description field
*/
public void setDescription(String description) {
this.description = description;
}

Expand Down Expand Up @@ -1445,4 +1478,31 @@ public String parseLicenseTitle() {
}
return licenseUrl;
}

/**
 * Returns the value of the given bean property with all HTML tags replaced by DocBook analogues.
 * <p>
 * The property is read reflectively through BeanUtils, so {@code fieldName} must name a readable property
 * of this object, e.g. "description".
 *
 * @param fieldName name of the property to read
 * @return the property value converted to DocBook markup, or null if the value is null or cannot be read
 */
public String getDocBookField(String fieldName) {
  String result = null;

  try {
    String value = BeanUtils.getProperty(this, fieldName);

    if (value != null) {
      result = replaceDocBookElements(value);
    }
  } catch (Exception e) {
    // Best effort: callers still receive null, but the failure is no longer silently discarded.
    LOG.error("Failed to read property '" + fieldName + "' for DocBook conversion", e);
  }

  return result;
}

/**
 * Converts HTML markup to DocBook: anchors become ulink/citetitle pairs, all other supported tags are
 * replaced one-to-one using the HTML_TAGS and DOCBOOK_TAGS tables.
 *
 * @param value HTML string to convert, must not be null
 * @return the string with HTML tags replaced by their DocBook analogues
 */
private String replaceDocBookElements(String value) {
  // Replace links. (?s) lets '.' match line terminators, so an anchor whose text wraps over several
  // lines is converted as well instead of being left behind as raw HTML.
  String htmlStringWithLinksReplaces =
      value.replaceAll(
          "(?s)<a\\s+href=\"(.*?)\">\\s*(.*?)\\s*</a>",
          "<ulink url=\"$1\"><citetitle>$2</citetitle></ulink>");

  // Perform replacements
  return StringUtils.replaceEach(htmlStringWithLinksReplaces, HTML_TAGS, DOCBOOK_TAGS);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -247,13 +247,11 @@
</pubDate>
<language>${eml.language!"en"}</language>
<#-- A description of the resource -->
<#if (eml.abstract?size>0)>
<#if eml.abstract?has_content>
<abstract>
<#list eml.abstract as p>
<#if p?has_content>
<para>${p}</para>
</#if>
</#list>
<#noescape>
${eml.getDocBookField("description")}
</#noescape>
</abstract>
</#if>
<#-- Zero or more sets of keywords and an associated thesaurus for each. -->
Expand Down Expand Up @@ -364,17 +362,17 @@
</#if>
<#if eml.introduction?has_content>
<#noescape>
<introduction>${eml.introduction}</introduction>
<introduction>${eml.getDocBookField("introduction")}</introduction>
</#noescape>
</#if>
<#if eml.gettingStarted?has_content>
<#noescape>
<gettingStarted>${eml.gettingStarted}</gettingStarted>
<gettingStarted>${eml.getDocBookField("gettingStarted")}</gettingStarted>
</#noescape>
</#if>
<#if eml.acknowledgements?has_content>
<#noescape>
<acknowledgements>${eml.acknowledgements}</acknowledgements>
<acknowledgements>${eml.getDocBookField("acknowledgements")}</acknowledgements>
</#noescape>
</#if>
<#if eml.updateFrequency??>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ public void testBuild() {
assertEquals(cal.getTime(), eml.getPubDate());

assertEquals("en_US", eml.getLanguage());
assertEquals("Specimens in jars.", eml.getAbstract().get(0));
assertEquals("<p>Specimens in jars.</p><p>Collected over years.</p><p>Still being curated.</p>", eml.getAbstract());

// multiple KeywordSets tests
assertNotNull(eml.getKeywords());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,7 @@ public void testRoundtrip() {

assertFalse(eml.getDescription().isEmpty());
assertFalse(eml2.getDescription().isEmpty());
assertEquals(eml2.getDescription().get(0), eml.getDescription().get(0));
assertEquals(eml2.getDescription().get(1), eml.getDescription().get(1));
assertEquals(eml2.getDescription().get(2), eml.getDescription().get(2));
assertEquals(eml2.getDescription(), eml.getDescription());

assertNotNull(eml.getPubDate());
assertEquals(eml2.getPubDate(), eml.getPubDate());
Expand Down

0 comments on commit 92a5207

Please sign in to comment.