Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocrd.processor.base: add property zip_input_files #635

Merged
merged 10 commits into from
Nov 3, 2020
54 changes: 50 additions & 4 deletions ocrd/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import os
import json
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

Expand All @@ -15,9 +15,11 @@

class Processor():
"""
A processor runs an algorithm based on the workspace, the mets.xml in the
workspace (and the input files defined therein) as well as optional
parameter.
A processor is an OCR-D compliant command-line-interface for executing
kba marked this conversation as resolved.
Show resolved Hide resolved
a single workflow step on the workspace (represented by local METS). It
reads input files for all or requested physical pages of the input fileGrp(s),
and writes output files for them into the output fileGrp(s). It may take
a number of optional or mandatory parameters.
"""

def __init__(
Expand Down Expand Up @@ -123,3 +125,47 @@ def input_files(self):
self.input_file_grp
))
return ret

def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE):
    """
    Align the input files of several input file groups page by page.

    ``self.input_file_grp`` is split on commas into individual fileGrp
    names. Every group is queried separately through
    ``self.workspace.mets.find_all_files`` (restricted to ``self.page_id``
    and ``mimetype``), and the hits are collected per page ID. Pages need
    not be present in every group: a missing entry is reported with an
    error log message and represented by ``None`` in the tuple.

    Files without a page ID are ignored. If a group yields more than one
    matching file for the same page, the one found last is kept.

    Args:
        require_first (bool): If true, drop every page that has no file
            in the first input fileGrp.
        mimetype (str): MIME type filter handed on unchanged to
            ``find_all_files``; defaults to the PAGE-XML MIME type.

    Returns:
        list: One tuple per retained page, with one slot per input
        fileGrp (in the order given in ``self.input_file_grp``), each
        holding the matching file or ``None``.
    """
    log = getLogger('ocrd.processor.base')
    group_names = self.input_file_grp.split(",")
    group_count = len(group_names)

    # One METS query per fileGrp looks wasteful, but instantiating files
    # only to filter them by fileGrp afterwards can cost considerably more
    # than walking the tree again (depends on pages vs. fileGrps ratio).
    by_page = {}
    for column, group in enumerate(group_names):
        matches = self.workspace.mets.find_all_files(
            pageId=self.page_id, fileGrp=group, mimetype=mimetype)
        for found in matches:
            if found.pageId:
                log.debug("adding page %s to input file group %s", found.pageId, group)
                row = by_page.setdefault(found.pageId, [None] * group_count)
                row[column] = found

    aligned = []
    for page_id, row in by_page.items():
        # Report every gap, even for pages that get dropped below.
        for group, entry in zip(group_names, row):
            if not entry:
                # other fallback options?
                log.error('found no page %s in file group %s',
                          page_id, group)
        if require_first and not row[0]:
            continue
        aligned.append(tuple(row))
    return aligned
31 changes: 29 additions & 2 deletions tests/processor/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from tempfile import TemporaryDirectory
from os.path import join
from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL

from ocrd_utils import MIMETYPE_PAGE
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging
from ocrd.resolver import Resolver
from ocrd.processor.base import Processor, run_processor, run_cli

Expand Down Expand Up @@ -89,5 +89,32 @@ def test_run_cli(self):
resolver=Resolver(),
)

def test_zip_input_files(self):
    """
    Check that files of two fileGrps are paired up by page ID when the
    MIME type filter admits both PAGE-XML and ALTO-XML.
    """
    class ZipTestProcessor(Processor):
        pass

    with pushd_popd(tempdir=True) as tempdir:
        workspace = self.resolver.workspace_from_nothing(directory=tempdir)
        # phys_0001: PAGE in GRP1, ALTO in GRP2
        workspace.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
        workspace.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
        # phys_0002: PAGE in both groups
        workspace.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
        workspace.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')

        processor = ZipTestProcessor(workspace=workspace, input_file_grp='GRP1,GRP2')
        page_or_alto = r'//application/(vnd.prima.page|alto)\+xml'
        id_pairs = [
            (first.ID, second.ID)
            for first, second in processor.zip_input_files(mimetype=page_or_alto)
        ]
        assert ('foobar1', 'foobar2') in id_pairs
        assert ('foobar3', 'foobar4') in id_pairs

def test_zip_input_files_require_first(self):
    """
    Check that with ``require_first=False`` a page lacking a file in the
    first fileGrp is still returned (with ``None`` in the first slot) and
    that the gap is reported as an error on the captured output.
    """
    initLogging()

    class ZipTestProcessor(Processor):
        pass

    self.capture_out_err()
    with pushd_popd(tempdir=True) as tempdir:
        workspace = self.resolver.workspace_from_nothing(directory=tempdir)
        # The GRP1 file carries no page ID, so GRP1 has nothing for phys_0001.
        workspace.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
        workspace.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')

        processor = ZipTestProcessor(workspace=workspace, input_file_grp='GRP1,GRP2')
        zipped = processor.zip_input_files(require_first=False)
        observed = [(first, second.ID) for first, second in zipped]
        assert observed == [(None, 'foobar2')]

        captured = self.capture_out_err()
        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in captured.err


if __name__ == "__main__":
main(__file__)