epub:html-splitter html-splitter
epubtools/modules/html-splitter/xpl/html-splitter.xpl
Import URI: http://transpect.le-tex.de/epubtools/modules/html-splitter/xpl/html-splitter.xpl
Sample invocation (for debugging purposes):
calabash/calabash.sh -i source=file:/$(cygpath -ma ../content/output/debug/epubtools/create-ops/pre-split.html) -i conf=file:/$(cygpath -ma adaptions/publisher/series/epubtools/heading-conf.xml) -o result=tmp.html -o report=report.xml -o files=files.xml file:/$(cygpath -ma epubtools/modules/html-splitter/xpl/html-splitter.xpl) base-uri=file:/$(cygpath -ma ../content/output/debug/epubtools/create-ops/pre-split.html) debug=yes debug-dir-uri=file:/$(cygpath -ma ../content/output/debug)
Calabash seems to suppress some XSLT errors, for instance when a stylesheet is looping. Therefore it might be necessary to replace collection()[…] with document(…) in the XSLT stylesheet (alternative variable declarations are already included in the .xsl file, commented out) and run Saxon from the command line, for example like this:
PRE_SPLIT=file:/$(cygpath -ma ../content/le-tex/whitepaper/de/output/output/debug/epubtools/create-ops/pre-split.html)
saxon -xsl:epubtools/modules/html-splitter/xsl/html-splitter.xsl -s:$PRE_SPLIT -it:main \
  debug-dir-uri=file:/$(cygpath -ma debug) \
  debug=yes \
  final-pub-type=EPUB2 \
  heading-conf-uri=file:/$(cygpath -ma adaptions/common/epubtools/heading-conf.xml) \
  meta-uri=file:/$(cygpath -ma ../content/le-tex/whitepaper/de/output/output/debug/epubtools/epub-config.xml) \
  datadir=file:/$(cygpath -ma debug/datadir)
Input Ports
Name | Documentation | Connections |
---|---|---|
sourceⓅ | ||
confⓈ | /hierarchy – may be included in /epub-config | |
meta | /epub-config | |
css-xml | XML representation of the parsed CSS |
Output Ports
Name | Documentation | Connections |
---|---|---|
resultⓅ | ||
files | ||
report |
Options
Name | Documentation | Default |
---|---|---|
base-uriⓇ | ||
target | 'EPUB2' | |
debug | 'no' | |
debug-dir-uri | 'debug' |
Subpipeline
Step | Inputs | Outputs | Options | ||||
---|---|---|---|---|---|---|---|
p:variable css-handling | (/epub-config/@css-handling, 'regenerated-per-split')[1] | ||||||
p:identity strip-leading-non-elements |
| result | |||||
p:group html-splitter-group | |||||||
p:variable workdir | replace($base-uri, '^(.*[/])+(.*)', '$1') | ||||||
p:variable basename | replace($base-uri, '^(.*[/])+(.*?)(\.[\w.]+)$', '$2') | ||||||
p:variable indent | (/epub-config/@indent, 'true')[1] | ||||||
letex:store-debug d94e102 |
| result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/splitter-input') active = $debug base-uri = $debug-dir-uri | ||||
p:xslt split |
| result | template-name = 'main' | ||||
letex:store-debug d94e141 | result | pipeline-step = concat('epubtools/html-splitter/', $basename, '/chunks') active = $debug base-uri = $debug-dir-uri | |||||
p:sink d94e150 | |||||||
p:choose per-split-css | |||||||
$css-handling = 'regenerated-per-split' | |||||||
p:xslt per-split-css-xml-representations Primary output: the new, reduced common CSS. Secondary port: individual CSS files if applicable. |
| result | |||||
p:sink d94e192 |
| ||||||
p:xslt insert-individual-css-link |
| result | template-name = 'main' | ||||
p:for-each generate-css | |||||||
css:generate gen |
| result | prepend-resource-path = '../' | ||||
$css-handling = 'unchanged' | |||||||
p:identity d94e231 | result | ||||||
p:otherwise | regenerated | ||||||
css:generate gen |
| result | prepend-resource-path = '../' | ||||
p:sink d94e255 | |||||||
p:identity d94e257 | result | ||||||
p:for-each store-chunks | |||||||
p:variable chunk-file-uri Uses base-uri(/*) instead of base-uri() because we set the base URI of the primary CSS by adding an xml:base attribute. | replace(base-uri(/*), 'chunks/', 'epub/OEBPS/') | ||||||
p:choose d94e290 | |||||||
matches($chunk-file-uri, '\.ncx$' ) | |||||||
p:store store-chunk |
| result | include-content-type = 'true' omit-xml-declaration = 'false' href = $chunk-file-uri doctype-public = if($target eq 'EPUB3') then '' else '-//NISO//DTD ncx 2005-1//EN' doctype-system = if($target eq 'EPUB3') then '' else 'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd' | ||||
matches($chunk-file-uri, '\.(txt|css)$') | |||||||
p:store d94e310 |
| result | method = 'text' encoding = 'UTF-8' href = $chunk-file-uri | ||||
$target eq 'EPUB3' | |||||||
p:store store-chunk |
| result | include-content-type = 'false' omit-xml-declaration = 'false' method = 'xhtml' indent = if ($indent = 'true') then 'true' else 'false' href = $chunk-file-uri | ||||
$target = 'EPUB2' and matches(base-uri(), 'nav\.xhtml$') | |||||||
p:sink d94e333 |
| ||||||
p:otherwise | |||||||
p:delete d94e340 |
| result | match = '@epub:type | html:nav[@epub:type = 'landmarks']' | ||||
p:store store-chunk | result | include-content-type = 'true' omit-xml-declaration = 'false' method = 'xhtml' doctype-public = '-//W3C//DTD XHTML 1.1//EN' doctype-system = 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' indent = if ($indent = 'true') then 'true' else 'false' href = $chunk-file-uri | |||||
p:xslt collect-file-uri |
| result | |||||
p:sink d94e367 |
| ||||||
p:for-each signal-splitting-error | The presence of an orig.txt is an indicator that the split text differs from the original text. We’ll raise an error. We don’t do it immediately within the split step because we want to store the results first so that you can do forensics. | ||||||
p:add-attribute orig-txt-url |
| result | match = '/html:p/html:a[1]' attribute-name = 'href' attribute-value = base-uri() | ||||
p:add-attribute chunks-txt-url |
| result | match = '/html:p/html:a[2]' attribute-name = 'href' attribute-value = replace(base-uri(), 'orig\.txt$', 'chunks.txt') | ||||
p:error splitting-error |
| result | code = 'epub:SPLT01' | ||||
p:wrap-sequence wrap-chunks |
| result | wrapper = 'document' wrapper-namespace = 'http://xmlcalabash.com/ns/extensions' wrapper-prefix = 'cx' | ||||
p:wrap-sequence wrap-chunk-uris |
| result | wrapper = 'document' wrapper-namespace = 'http://xmlcalabash.com/ns/extensions' wrapper-prefix = 'cx' | ||||
p:add-attribute d94e432 |
| result | match = '/*' attribute-name = 'transpect:step-name' attribute-value = 'html-splitter' | ||||
p:add-attribute report | result | match = '/*' attribute-name = 'transpect:rule-family' attribute-value = 'html-splitter' |