OCR-D · kba · Nov 3, 2020 · Oct 23, 2020 · Oct 26, 2020 · Oct 26, 2020
diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
@@ -6,7 +6,7 @@
 
 import os
 import json
-from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
+from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
 from ocrd_validators import ParameterValidator
 from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
 
@@ -15,9 +15,11 @@
 
 class Processor():
  """
- A processor runs an algorithm based on the workspace, the mets.xml in the
- workspace (and the input files defined therein) as well as optional
- parameter.
+ A processor is an OCR-D compliant command-line interface for executing
+ a single workflow step on the workspace (represented by local METS). It
+ reads input files for all or requested physical pages of the input fileGrp(s),
+ and writes output files for them into the output fileGrp(s). It may take
+ a number of optional or mandatory parameters.
  """
 
  def __init__(
@@ -123,3 +125,47 @@ def input_files(self):
  self.input_file_grp
  ))
  return ret
+
+ def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE):
+ """
+ List tuples of input files (for multiple input file groups).
+
+ Processors that expect/need multiple input file groups
+ cannot use ``input_files``. They must align (zip) input files
+ across pages. A page missing from a file group yields ``None``
+ in that group's slot, and an error is logged for each such gap.
+
+ Args:
+ require_first (bool): If true, then skip a page entirely
+ whenever it is not available in the first input fileGrp.
+ mimetype (str): MIME type filter handed unchanged to the METS
+ file search; defaults to ``MIMETYPE_PAGE`` (PAGE-XML).
+ Returns:
+ list: one tuple per page, with one slot per input fileGrp.
+ """
+ LOG = getLogger('ocrd.processor.base')
+ ifgs = self.input_file_grp.split(",")
+ # Iterating over all files repeatedly may seem inefficient at first sight,
+ # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
+ # can actually be much more costly than traversing the ltree.
+ # This might depend on the number of pages vs number of fileGrps.
+
+ pages = dict()  # pageId -> list of files, one slot per input fileGrp
+ for i, ifg in enumerate(ifgs):
+ for file_ in self.workspace.mets.find_all_files(
+ pageId=self.page_id, fileGrp=ifg, mimetype=mimetype):
+ if not file_.pageId:
+ continue  # a file without a pageId cannot be aligned to any page
+ LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
+ ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
+ ift[i] = file_  # a later match for the same page and group replaces the earlier one
+ ifts = list()
+ for page, ifiles in pages.items():
+ for i, ifg in enumerate(ifgs):
+ if not ifiles[i]:
+ # other fallback options?
+ LOG.error('found no page %s in file group %s',
+ page, ifg)
+ if ifiles[0] or not require_first:
+ ifts.append(tuple(ifiles))
+ return ifts
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
@@ -2,10 +2,10 @@
 
 from tempfile import TemporaryDirectory
 from os.path import join
-from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
+from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
 from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
 
-from ocrd_utils import MIMETYPE_PAGE
+from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging
 from ocrd.resolver import Resolver
 from ocrd.processor.base import Processor, run_processor, run_cli
 
@@ -89,5 +89,32 @@ def test_run_cli(self):
  resolver=Resolver(),
  )
 
+ def test_zip_input_files(self):  # files sharing a pageId are paired across the two fileGrps
+ class ZipTestProcessor(Processor): pass
+ with pushd_popd(tempdir=True) as tempdir:
+ ws = self.resolver.workspace_from_nothing(directory=tempdir)
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
+ ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')  # ALTO rather than PAGE MIME type
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
+ ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
+ proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+ tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]  # filter covers the ALTO MIME type as well
+ assert ('foobar1', 'foobar2') in tuples
+ assert ('foobar3', 'foobar4') in tuples
+
+ def test_zip_input_files_require_first(self):  # with require_first=False a page absent from GRP1 is still returned
+ initLogging()
+ class ZipTestProcessor(Processor): pass
+ self.capture_out_err()
+ with pushd_popd(tempdir=True) as tempdir:
+ ws = self.resolver.workspace_from_nothing(directory=tempdir)
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)  # no pageId, so zip_input_files skips this file
+ ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+ proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+ assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]  # GRP1 slot stays None
+ r = self.capture_out_err()
+ assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err  # the GRP1 gap is logged as an error
+
+
 if __name__ == "__main__":
  main(__file__)