Skip to content

Commit

Permalink
Merge pull request #27 from dariok/parametrize
Browse files Browse the repository at this point in the history
Parameters for various functions
  • Loading branch information
dariok authored Dec 4, 2023
2 parents 6292120 + 117a0d8 commit c184c12
Show file tree
Hide file tree
Showing 17 changed files with 4,839 additions and 1,103 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
/tests-wd/
/tests2/
AS/
page2tei-output/

20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,25 @@ Apply page2tei-0.xsl to the METS File:
java -jar saxon9he.jar -xsl:page2tei-0.xsl -s:mets.xml -o:[your tei file].xml
```

Additional stylesheets can be applied to the output created by the basic transformation.
Additional stylesheets can be applied to the output created by the basic transformation:
- `combine-continued.xsl` (or set parameter `combine=true()`) — try to combine entities that are split over a line break into one element
- `simplify-coordinates.xsl` (parameter `bounding-rectangles=true()` by default) — convert polygons into bounding rectangles
- `tokenize.xsl` (or set parameter `tokenize=true()`) — perform (very basic!) whitespace tokenization

## Parameters
You can set the following parameters when calling `page2tei-0.xsl` (via command line or via an oXygen scenario; in oXygen, the parameters should be marked as “XPath”):

- rs (default: `true()`): create `rs type="..."` for person/place/org (default) or `persName` etc.
- tokenize (default: `false()`): Whether to run whitespace tokenization
- combine (default: `false()`): Whether to combine entities over line breaks
- ab (default: `false()`): If false(), region types that correspond to valid TEI elements will be returned as
this element; types that do not correspond to a TEI element will be returned as
tei:ab[@type]. If set to true(), all region types (except for paragraph, heading) will be
returned as tei:ab.
- word-coordinates (default: `false()`): If true(), export the (estimated) word coordinates to the facsimile section.
- bounding-rectangles (default: `true()`): Whether to create bounding rectangles from polygons
- withoutBaseline (default: `false()`): Whether to export lines without baseline or not
- withoutTextline (default: `false()`): Whether to export regions without text lines


## Contributors
Expand Down
46 changes: 40 additions & 6 deletions combine-continued.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,51 @@
</xd:doc>

<xd:doc>
<xd:desc>Combine continued rs</xd:desc>
<xd:desc>
<xd:p>Combine continued elements (e.g. rs)</xd:p>
<xd:p>This works on an element that contains (more than one) continued element (there may be just one element
if the continuation happens across region borders).</xd:p>
</xd:desc>
</xd:doc>
<xsl:template match="tei:*[tei:*[@continued = 'true']]" mode="continued">
<xsl:template match="tei:*[count(tei:*[@continued = 'true']) gt 1]" mode="continued">
<xsl:copy>
<xsl:apply-templates select="@*" mode="continued" />
<xsl:for-each-group select="node()"
group-starting-with="tei:*[@continued eq 'true' and normalize-space(preceding::text()[1]) != '']">
group-starting-with="tei:*[
@continued eq 'true'
and normalize-space() != ''
and (normalize-space(preceding::text()[1]) != '' or preceding::text()[1][not(preceding-sibling::*)])
]">

<xsl:choose>
<xsl:when test="current-group()[1][@continued eq 'true' and tei:abbr]">
<!-- we assume there is exactly 2 choice with one lb in between, so no multi-line abbreviations:
1=choice, 2=text(), 3=lb, 4=text(), 5=choice -->
<choice>
<expan>
<xsl:sequence select="current-group()[1]/tei:expan/node()" />
</expan>
<abbr>
<xsl:sequence select="current-group()[1]/tei:abbr/node()" />
<xsl:sequence select="current-group()[3]" />
<xsl:sequence select="current-group()[4]/tei:abbr/node()" />
</abbr>
</choice>
<xsl:apply-templates select="current-group()[position() gt 4]" mode="continued" />
</xsl:when>
<xsl:when test="current-group()[1][@continued eq 'true']">
<xsl:variable name="final"
select="(current-group()[position() gt 1 and @continued = 'true'][last()], current-group()[4])[1]" />
<xsl:variable
name="final"
select="
(
current-group()[
position() gt 1
and @continued = 'true'
and node()
][last()],
current-group()[4]
)[1]"
/>
<xsl:variable name="last" select="index-of(current-group(), $final)[1]"/>
<xsl:element name="{local-name()}">
<xsl:apply-templates select="@*" mode="continued" />
Expand All @@ -46,7 +80,7 @@
<xd:desc>lb will be returned unaltered</xd:desc>
</xd:doc>
<xsl:template match="tei:lb" mode="rs-continued">
<lb break="no">
<lb>
<xsl:sequence select="@*" />
</lb>
</xsl:template>
Expand Down
49 changes: 49 additions & 0 deletions combine-hi.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns:math="http://www.w3.org/2005/xpath-functions/math"
xmlns="http://www.tei-c.org/ns/1.0"
exclude-result-prefixes="#all"
version="3.0">

<!--
  Merge runs of adjacent tei:hi children that share the same @style into one tei:hi.

  Matches: any element that has at least one tei:hi child (mode combine-hi).
  Output:  a copy of the element; its attributes are copied as they are, its children
           are regrouped as described below.

  Grouping key (one string per child node):
    * a node with @style           : 'style:' followed by the value of @style
    * a node without string content (whitespace-only text, empty elements such as tei:lb):
      the key of the nearest preceding sibling that is styled or has content, if that
      sibling is styled; this lets highlighting continue over blanks and line breaks
    * everything else              : 'plain'
  Keying on the style value instead of a plain boolean keeps adjacent tei:hi with
  different styles apart, so the second style is no longer silently replaced by the first.
-->
<xsl:template match="*[tei:hi]" mode="combine-hi">
  <xsl:copy>
    <xsl:sequence select="@*" />
    <xsl:for-each-group select="node()" group-adjacent="
        if (@style) then
          'style:' || @style
        else if (normalize-space() = '') then
          (preceding-sibling::node()[@style or normalize-space() != ''][1]/@style ! ('style:' || .), 'plain')[1]
        else
          'plain'">
      <xsl:choose>
        <xsl:when test="current-group()[self::tei:hi]">
          <xsl:variable name="firstHi" select="current-group()[self::tei:hi][1]" />
          <xsl:variable name="lastNode" select="current-group()[last()]" />
          <!-- true if the group ends in a single blank that is not the first tei:hi itself; that
               blank is moved behind the merged tei:hi instead of being highlighted. Excluding
               $firstHi avoids emitting the blank twice when the only tei:hi consists of one space. -->
          <xsl:variable name="trailingSpace" select="not($lastNode is $firstHi) and $lastNode = ' '" />

          <!-- members of the group that come before the first tei:hi are passed through unchanged -->
          <xsl:sequence select="current-group() intersect $firstHi/preceding-sibling::node()" />
          <hi>
            <xsl:sequence select="$firstHi/@style" />
            <!-- first tei:hi plus every later member of the group, minus a trailing single blank -->
            <xsl:apply-templates
              select="$firstHi | ((current-group() intersect $firstHi/following-sibling::node()) except $lastNode[$trailingSpace])"
              mode="do-combine-hi" />
          </hi>
          <xsl:if test="$trailingSpace">
            <xsl:text> </xsl:text>
          </xsl:if>
        </xsl:when>
        <xsl:otherwise>
          <!-- no tei:hi in this group: recurse so that nested elements are processed as well -->
          <xsl:apply-templates select="current-group()" mode="combine-hi"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:for-each-group>
  </xsl:copy>
</xsl:template>

<!-- A tei:hi that is being merged into a new wrapper: drop the element itself and
     continue with its children in mode combine-hi. -->
<xsl:template match="tei:hi" mode="do-combine-hi">
  <xsl:apply-templates select="node()" mode="combine-hi"/>
</xsl:template>

<!-- Fallback for both modes: shallow copy of the current attribute or node, then
     attributes first and children second, staying in whichever mode is active. -->
<xsl:template match="@* | node()" mode="combine-hi do-combine-hi">
  <xsl:copy>
    <xsl:apply-templates select="@*, node()" mode="#current"/>
  </xsl:copy>
</xsl:template>
</xsl:stylesheet>
124 changes: 109 additions & 15 deletions page2tei-0.xsl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
<?xml version="1.0" encoding="UTF-8"?><xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
xmlns="http://www.tei-c.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns:p="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
Expand All @@ -9,6 +8,11 @@
version="3.0">

<xsl:output indent="0"/>

<xsl:variable name="langs" select="map { 'French': 'fr', 'German': 'de', 'English': 'en', 'Latin': 'la',
'Spanish': 'es', 'Ancient Greek': 'grc' }"/>

<xsl:include href="combine-hi.xsl" />

<xd:doc>
<xd:desc>Whether to create `rs type="..."` for person/place/org (default) or `persName` etc.
Expand Down Expand Up @@ -46,6 +50,7 @@
<xd:desc>Whether to create bounding rectangles from polygons (default: true())</xd:desc>
</xd:doc>
<xsl:param name="bounding-rectangles" select="true()"/>
<xsl:include href="simplify-coordinates.xsl" />

<xd:doc>
<xd:desc>Whether to export lines without baseline (true()) or not (false(), default)</xd:desc>
Expand All @@ -57,6 +62,12 @@
default)</xd:desc>
</xd:doc>
<xsl:param name="withoutTextline" select="false()"/>

<xd:doc>
<xd:desc>Whether to export custom attributes from tags that we do not know how to convert to valid TEI (true(),
default) or whether to discard them (false()).</xd:desc>
</xd:doc>
<xsl:param name="unknownAttributes" select="true()" />

<xd:doc scope="stylesheet">
<xd:desc>
Expand Down Expand Up @@ -92,6 +103,13 @@

<xsl:param name="debug" select="false()"/>

<xd:doc>
<xd:desc>Entry point: starts at the document node and passes processing on to the mets:mets root element.</xd:desc>
</xd:doc>
<xsl:template match="/">
<!-- only a mets:mets child of the document node is processed; other document-level nodes are not selected -->
<xsl:apply-templates select="child::mets:mets" />
</xsl:template>

<xd:doc>
<xd:desc>helper: gather page contents</xd:desc>
</xd:doc>
Expand Down Expand Up @@ -137,12 +155,29 @@
</xsl:text>
</fileDesc>
<xsl:text>
</xsl:text>
<profileDesc>
<xsl:apply-templates select="descendant::*:trpDocMetadata/*:language" />
<xsl:text>
</xsl:text>
</profileDesc>
<xsl:text>
</xsl:text>
</teiHeader>
<xsl:text>
</xsl:text>
<facsimile>
<xsl:apply-templates select="mets:fileSec//mets:fileGrp[@ID = 'PAGEXML']/mets:file" mode="facsimile"/>
<xsl:choose>
<xsl:when test="$bounding-rectangles">
<xsl:variable name="facs">
<xsl:apply-templates select="mets:fileSec//mets:fileGrp[@ID = 'PAGEXML']/mets:file" mode="facsimile"/>
</xsl:variable>
<xsl:apply-templates select="$facs" mode="bounding-rectangle" />
</xsl:when>
<xsl:otherwise>
<xsl:apply-templates select="mets:fileSec//mets:fileGrp[@ID = 'PAGEXML']/mets:file" mode="facsimile"/>
</xsl:otherwise>
</xsl:choose>
<xsl:text>
</xsl:text>
</facsimile>
Expand Down Expand Up @@ -170,13 +205,16 @@
</xsl:otherwise>
</xsl:choose>
</xsl:variable>
<xsl:variable name="combined-hi">
<xsl:apply-templates select="$combined" mode="combine-hi" />
</xsl:variable>
<xsl:variable name="tokenized">
<xsl:choose>
<xsl:when test="$tokenize">
<xsl:apply-templates select="$combined" mode="tokenize" />
<xsl:apply-templates select="$combined-hi" mode="tokenize" />
</xsl:when>
<xsl:otherwise>
<xsl:copy-of select="$combined" />
<xsl:copy-of select="$combined-hi" />
</xsl:otherwise>
</xsl:choose>
</xsl:variable>
Expand Down Expand Up @@ -304,6 +342,28 @@
<xsl:value-of select="."/>
</idno>
</xsl:template>

<xd:doc>
<xd:desc>Transkribus meta data: languages</xd:desc>
</xd:doc>
<!-- Turns a language element into tei:langUsage: its string value is split at ', ' and
     every part is written as one tei:language, with the part itself as element content.
     NOTE(review): the unprefixed match pattern only matches if the element is in the namespace
     that unprefixed XPath names resolve to in this stylesheet; confirm against the input, as the
     calling apply-templates selects *:language. -->
<xsl:template match="language">
<xsl:text>
</xsl:text>
<langUsage>
<!-- inside the loop the context item is one language name (a string), not a node -->
<xsl:for-each select="tokenize(., ', ')">
<xsl:text>
</xsl:text>
<language>
<!-- @ident is looked up in $langs by language name.
     NOTE(review): a name that is not a key of $langs presumably yields an empty @ident; confirm this is acceptable. -->
<xsl:attribute name="ident">
<xsl:value-of select="map:get($langs, .)" />
</xsl:attribute>
<xsl:value-of select="." />
</language>
</xsl:for-each>
<xsl:text>
</xsl:text>
</langUsage>
</xsl:template>

<!-- Templates for METS -->
<xd:doc>
Expand Down Expand Up @@ -683,7 +743,7 @@
<xsl:variable name="content" select="substring-after(., '{') => normalize-space()"/>
<xsl:variable name="name" select="substring-before(., ' {') => normalize-space()"/>
<xsl:choose>
<xsl:when test="$content = '' or $name = ('readingOrder', 'structure')"/>
<xsl:when test="not(contains(., 'offset:'))" />
<xsl:otherwise>
<xsl:value-of select="normalize-space()"/>
</xsl:otherwise>
Expand Down Expand Up @@ -713,12 +773,24 @@
</xsl:variable>
<xsl:variable name="prepped">
<xsl:for-each select="0 to string-length($text)">
<xsl:if test=". &gt; 0">
<xsl:if test=".">
<xsl:value-of select="substring($text, ., 1)"/>
</xsl:if>
<!-- place end marker for all non-void elements that end here; we must not place void elements here
as this would mean closing a tei:gap before it was opened -->
<xsl:for-each select="map:get($ends, .)">
<xsl:sort select="substring-before(substring-after(., 'offset:'), ';')"
order="descending"/>
<xsl:sort select="substring(., 1, 3)" order="descending"/>
<xsl:if test="substring-after(., 'length:') => substring-before(';') != '0'">
<xsl:element name="local:m">
<xsl:attribute name="type" select="normalize-space(substring-before(., ' '))"/>
<xsl:attribute name="o" select="substring-after(., 'offset:')"/>
<xsl:attribute name="pos">e</xsl:attribute>
</xsl:element>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="map:get($starts, .)">
<!--<xsl:sort select="substring-before(substring-after(.,'offset:'), ';')" order="ascending"/>-->
<!-- end of current tag -->
<xsl:sort select="
xs:int(substring-before(substring-after(., 'offset:'), ';'))
+ xs:int(substring-before(substring-after(., 'length:'), ';'))"
Expand All @@ -730,15 +802,18 @@
<xsl:attribute name="pos">s</xsl:attribute>
</xsl:element>
</xsl:for-each>
<!-- place end marker for void elements such as tei:gap -->
<xsl:for-each select="map:get($ends, .)">
<xsl:sort select="substring-before(substring-after(., 'offset:'), ';')"
order="descending"/>
<xsl:sort select="substring(., 1, 3)" order="descending"/>
<xsl:element name="local:m">
<xsl:attribute name="type" select="normalize-space(substring-before(., ' '))"/>
<xsl:attribute name="o" select="substring-after(., 'offset:')"/>
<xsl:attribute name="pos">e</xsl:attribute>
</xsl:element>
<xsl:if test="substring-after(., 'length:') => substring-before(';') = '0'">
<xsl:element name="local:m">
<xsl:attribute name="type" select="normalize-space(substring-before(., ' '))"/>
<xsl:attribute name="o" select="substring-after(., 'offset:')"/>
<xsl:attribute name="pos">e</xsl:attribute>
</xsl:element>
</xsl:if>
</xsl:for-each>
</xsl:for-each>
</xsl:variable>
Expand Down Expand Up @@ -840,6 +915,9 @@
<xsl:if test="$custom?underlined = 'true'">
<xsl:text>text-decoration: underline;</xsl:text>
</xsl:if>
<xsl:if test="$custom?strikethrough = 'true'">
<xsl:text>text-decoration: line-through;</xsl:text>
</xsl:if>
<xsl:if test="number($custom?fontSize) gt 0">
<xsl:value-of select="'font-size: ' || $custom?fontSize || 'px;'"/>
</xsl:if>
Expand All @@ -852,6 +930,12 @@
<xsl:if test="$custom?superscript = 'true'">
<xsl:text>vertical-align: superscript;</xsl:text>
</xsl:if>
<xsl:if test="$custom?smallCaps = 'true'">
<xsl:text>font-variant-caps: small-caps;</xsl:text>
</xsl:if>
<xsl:if test="$custom?letterSpaced = 'true'">
<xsl:text>letter-spacing: 5px;</xsl:text>
</xsl:if>
</xsl:variable>
<hi>
<xsl:if test="count($rend) gt 0">
Expand All @@ -871,6 +955,9 @@
</xsl:when>
<xsl:when test="@type = 'abbrev'">
<choice>
<xsl:if test="$custom('continued')">
<xsl:attribute name="continued" select="true()"/>
</xsl:if>
<expan>
<xsl:value-of select="replace(map:get($custom, 'expansion'), '\\u0020', ' ')"/>
</expan>
Expand Down Expand Up @@ -930,7 +1017,14 @@
<xsl:if test="$custom('continued')">
<xsl:attribute name="continued" select="true()"/>
</xsl:if>

<xsl:if test="$unknownAttributes">
<xsl:for-each select="map:keys($custom)">
<xsl:if test="not(. = ('', 'length', 'lastname', 'firstnam'))">
<xsl:attribute name="{.}" select="$custom(.)"/>
</xsl:if>
</xsl:for-each>
</xsl:if>

<xsl:call-template name="elem">
<xsl:with-param name="elem" select="$elem"/>
</xsl:call-template>
Expand Down
Loading

0 comments on commit c184c12

Please sign in to comment.