Skip to content

Commit

Permalink
BaseProcessor.input_files: Implement logic for PAGEXML/image in same …
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Jul 31, 2020
1 parent 33175ab commit 84a4e1a
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions ocrd/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from click import wrap_text
from time import time
import subprocess
from ocrd_utils import getLogger, VERSION as OCRD_VERSION
from ocrd_utils import getLogger, VERSION as OCRD_VERSION, MIMETYPE_PAGE
from ocrd_validators import ParameterValidator

log = getLogger('ocrd.processor')
Expand Down Expand Up @@ -239,6 +239,26 @@ def process(self):
@property
def input_files(self):
    """
    List the input files.

    - If there's a PAGE-XML for the page, take it (and forget about all
      other files for that page)
    - Else if there's only one image, take it (and forget about all other
      files for that page)
    - Otherwise raise an error (complaining that only PAGE-XML warrants
      having multiple images for a single page)

    (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)

    Returns:
        Whatever ``self.workspace.mets.find_files`` yields for
        ``self.input_file_grp`` / ``self.page_id``: the PAGE-XML matches
        if there are any, otherwise the image matches.

    Raises:
        ValueError: if ``self.page_id`` is set, no PAGE-XML file was found
            and more than one image file matched.
    """
    # PAGE-XML takes precedence: if any is present, nothing else is considered.
    page_files = self.workspace.mets.find_files(
        fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
    if page_files:
        return page_files

    # NOTE(review): the leading '//' presumably marks the value as a regex
    # for find_files -- confirm against the METS API.
    image_files = self.workspace.mets.find_files(
        fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")

    # The ambiguity check only applies when a page is selected; without a
    # page_id the query is presumably not limited to a single page, so a
    # count above one is not treated as an error here.
    if self.page_id and len(image_files) > 1:
        raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
            "for page '%s'" % self.page_id,
            self.input_file_grp
        ))
    return image_files

0 comments on commit 84a4e1a

Please sign in to comment.