Cache functionality #875

Merged
merged 51 commits into from
Nov 23, 2022
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
db38852
Cache functionality added. Main tests pass
MehmedGIT May 25, 2022
60e1a60
Fixes and cache tests added
MehmedGIT May 30, 2022
d8c2f50
Fix typo
MehmedGIT May 31, 2022
ef1c757
Applying the changes suggested by kba
MehmedGIT Jun 1, 2022
2414e63
page_cache and fptr_cache added
MehmedGIT Jun 3, 2022
d7d196e
Add the missed page_cache in clearCache
MehmedGIT Jun 3, 2022
0058823
Fixing some bugs
MehmedGIT Jun 7, 2022
ce3ffc8
Extend tests for 200 pages
MehmedGIT Jun 8, 2022
39bdf5e
Comment out test case for 200 pages - takes too long
MehmedGIT Jun 8, 2022
f8d3ac2
No change. Trigger scrutinizer again.
MehmedGIT Jun 15, 2022
b423d1d
Include extreme example benchmarking tests
MehmedGIT Jun 28, 2022
dc6e387
Extreme benchmark test for 750 files per page (5000 pages)
MehmedGIT Oct 4, 2022
e86b8c2
clean the changes
MehmedGIT Oct 12, 2022
f1e6597
To keep the cache_functionality branch up-to-date
MehmedGIT Oct 12, 2022
9ff4d26
Cache functionality after master merge
MehmedGIT Oct 12, 2022
2326a94
Test for 500 pages and 1500 files per page
MehmedGIT Oct 12, 2022
90212ea
Fix the test case
MehmedGIT Oct 12, 2022
50e0f95
Remove the conflicting file
MehmedGIT Oct 24, 2022
afa0162
Return back ocrd_mets
MehmedGIT Oct 24, 2022
6ed7c07
Merge branch 'master' into cache_functionality
MehmedGIT Oct 24, 2022
f997e5a
Cache functionality added again
MehmedGIT Oct 24, 2022
bdf5741
Fix missing parts
MehmedGIT Oct 24, 2022
ffe97cc
Fix the returned constructor with caching flag
MehmedGIT Oct 24, 2022
335d8d2
Fix cache tests
MehmedGIT Oct 24, 2022
a70bf58
test_ocrd_mets_cache: adapt to changed behavior because of caching
kba Nov 3, 2022
2206ccd
remove unnecessary else indent
kba Nov 3, 2022
d095fa6
use log.debug instead print
kba Nov 3, 2022
3f9348a
mets caching: iterate only over actual fileGrp elements
kba Nov 3, 2022
06d22af
mets caching: iterate only over mets:div[@TYPE="page"]
kba Nov 3, 2022
bd04777
mets caching: use fileGrp cache for OcrdMets.file_groups
kba Nov 3, 2022
78715a7
Merge branch 'master' into cache_functionality-kba
kba Nov 3, 2022
ae75e37
Merge pull request #944 from OCR-D/cache_functionality-kba
MehmedGIT Nov 4, 2022
5b50ca0
OcrdMets.find_files: allow mixing regex, range and literal multi-valu…
kba Nov 17, 2022
6fd0220
generate_range: raise ValueError if start == end
kba Nov 17, 2022
9cf0d9c
generate_range: choose the last number in a string
kba Nov 17, 2022
016a370
Merge branch 'master' into cache_functionality
kba Nov 20, 2022
c9e1180
separate targets benchmark{,-extreme} for the METS benchmarks
kba Nov 20, 2022
6522e54
test_ocrd_mets: combine caching and non-caching tests
kba Nov 20, 2022
f7a0f5b
merge #955
kba Nov 20, 2022
a6656da
Add fileGrp parameter to remove function
MehmedGIT Nov 21, 2022
4e4b3ee
OcrdMets.__str__: also provide cached/non-cached status
kba Nov 21, 2022
82b3e4f
OcrdMets.__str__: fix it and str test
MehmedGIT Nov 22, 2022
27b6c86
OcrdMets: Don't defend against inconsistency cache vs XML
kba Nov 22, 2022
1e8ff90
OcrdMets: remove outdated comment
kba Nov 22, 2022
ffcd89f
OcrdMets.set_physical_page_for_file: pageId is always a str
kba Nov 22, 2022
4da45f6
OcrdMets: Don't defend against inconsistency cache vs XML
kba Nov 22, 2022
7724191
docstring for OcrdMets.remove_one_file
kba Nov 22, 2022
384b4ac
Merge branch 'cache_functionality' of https:/OCR-D/core i…
kba Nov 22, 2022
2fad30b
revert 4da45f6b3 (el_pagediv can be legitimately None here )
kba Nov 22, 2022
3c5ac1e
enable caching by setting OCRD_METS_CACHING=true env var
kba Nov 22, 2022
f21a33a
readme: add a stub section on configuration
kba Nov 23, 2022
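The last two commits make the cache opt-in at runtime: per commit 3c5ac1e, caching is enabled by setting the OCRD_METS_CACHING environment variable to true, and f21a33a adds a stub configuration section to the readme. A minimal sketch of how a caller might use this, assuming the variable is read when the METS document is loaded (the exact point at which it is evaluated is not visible in this view, and 'mets.xml' is a placeholder path):

import os
from ocrd_models import OcrdMets

# Opt in to METS caching for this process before loading a document.
os.environ['OCRD_METS_CACHING'] = 'true'

mets = OcrdMets(filename='mets.xml')  # caching picked up from the environment, per commit 3c5ac1e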
1 change: 0 additions & 1 deletion ocrd_models/ocrd_models/ocrd_file.py
@@ -138,7 +138,6 @@ def pageId(self, pageId):
raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self)
self.mets.set_physical_page_for_file(pageId, self)


@property
def loctype(self):
"""
361 changes: 307 additions & 54 deletions ocrd_models/ocrd_models/ocrd_mets.py

Large diffs are not rendered by default.
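The commit messages outline what that large diff does: commit 2414e63 adds a page_cache and an fptr_cache, bd04777 uses a fileGrp cache for OcrdMets.file_groups, and 3f9348a/06d22af restrict iteration to actual mets:fileGrp and mets:div[@TYPE="page"] elements. The following is only an illustrative sketch of such a cache layout, with assumed names, not the actual ocrd_mets.py implementation:

# Illustrative sketch only (assumed names), not the code in ocrd_mets.py.
class MetsCacheSketch:
    def __init__(self):
        self.file_cache = {}   # fileGrp USE -> {file ID -> mets:file element}
        self.page_cache = {}   # physical page ID -> mets:div[@TYPE="page"] element
        self.fptr_cache = {}   # physical page ID -> {file ID -> mets:fptr element}

    def find_file(self, file_grp, file_id):
        # Two dictionary lookups instead of an XPath scan over the whole tree.
        return self.file_cache.get(file_grp, {}).get(file_id)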

6 changes: 5 additions & 1 deletion ocrd_models/ocrd_models/ocrd_xml_base.py
@@ -16,11 +16,12 @@ class OcrdXmlDocument():
Base class for XML documents loaded from either content or filename.
"""

def __init__(self, filename=None, content=None):
def __init__(self, filename=None, content=None, cache_flag=False):
"""
Args:
filename (string):
content (string):
cache_flag (bool):
"""
# print(self, filename, content)
if filename is None and content is None:
@@ -34,6 +34,9 @@ def __init__(self, filename=None, content=None):
raise Exception('File does not exist: %s' % filename)
self._tree.parse(filename)

# Cache enabled - True/False
self._cache_flag = cache_flag

def to_xml(self, xmllint=False):
"""
Serialize all properties as pretty-printed XML
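The cache_flag added here is what the benchmark below passes through OcrdMets.empty_mets(cache_flag=True). A minimal usage sketch based on those two signatures, assuming OcrdMets forwards the keyword to this base constructor ('mets.xml' is a placeholder path):

from ocrd_models import OcrdMets

# Build an empty METS document with caching enabled, as the benchmark below does.
cached = OcrdMets.empty_mets(cache_flag=True)

# Load an existing METS file with caching enabled.
loaded = OcrdMets(filename='mets.xml', cache_flag=True)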
1 change: 1 addition & 0 deletions requirements_test.txt
@@ -1,6 +1,7 @@
autopep8
pytest >= 4.0.0
generateDS == 2.35.20
pytest-benchmark >= 3.2.3
coverage >= 4.5.2
sphinx
sphinx_click
277 changes: 277 additions & 0 deletions tests/model/mets_bench_extreme.py
@@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-

from contextlib import contextmanager
from time import time

from pytest import main, fixture, mark

from ocrd import Resolver
from ocrd_utils import MIME_TO_EXT, getLogger
from ocrd_models import OcrdMets

logger = getLogger('ocrd.benchmark.mets')

GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP']
GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR']

REGIONS_PER_PAGE = 2
LINES_PER_REGION = 2
FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE
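# i.e. 8 image groups * 2 files (TIFF + PAGE-XML) + 7 segmentation groups * 2 region images = 30 files per page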

# Caching is disabled by default
def _build_mets(number_of_pages, force=False, cache_flag=False):
mets = OcrdMets.empty_mets(cache_flag=cache_flag)
mets._number_of_pages = number_of_pages

for n in ['%04d' % (n + 1) for n in range(number_of_pages)]:
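        # Helper: add one file to the current page; unless an explicit ID is given,
        # derive the ID and URL from the fileGrp, page number and mimetype extension.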
_add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file(
fileGrp,
mimetype=mimetype,
pageId='PHYS_%s' % n,
ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()),
url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype))
)
for grp in GRPS_IMG:
# LINES_PER_REGION = 2
_add_file(n, grp, 'image/tiff')
_add_file(n, grp, 'application/vnd.prima.page+xml')
for grp in GRPS_REG:
# REGIONS_PER_PAGE = 2
for region_n in range(REGIONS_PER_PAGE):
_add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n))

return mets

def assert_len(expected_len, mets, kwargs):
test_list = mets.find_all_files(**kwargs)
assert expected_len == len(test_list)

def benchmark_find_files(number_of_pages, mets):
benchmark_find_files_filegrp(number_of_pages, mets)
benchmark_find_files_fileid(number_of_pages, mets)
benchmark_find_files_physical_page(number_of_pages, mets)
    # This is not really useful to measure:
    # when no specific search parameters are provided, both the cached and
    # non-cached implementations iterate over all files in the same routine.
    # benchmark_find_files_all(number_of_pages, mets)

def benchmark_find_files_filegrp(number_of_pages, mets):
# Best case - first fileGrp
assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG'))
# Worst case - does not exist
assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST'))

def benchmark_find_files_fileid(number_of_pages, mets):
# Best case - first file ID
assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL'))
# Worst case - does not exist
assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST'))

def benchmark_find_files_physical_page(number_of_pages, mets):
# Best case - first physical page
assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001'))
# Worst case - does not exist
assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS'))

# Get all files, i.e., pass an empty search parameter -> dict()
def benchmark_find_files_all(number_of_pages, mets):
assert_len((number_of_pages * FILES_PER_PAGE), mets, dict())




# ---- BENCHMARKING for 50-500-1000-2000-5000 pages ---- #

# ----- 50 pages -> build, search, build (cached), search (cached) ----- #
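# Each page count repeats the same pattern: the "build" test stores the document in a
# module-level global so the matching "search" test can reuse it, and `del` releases it
# before the next, larger run.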
mets_50 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b50(benchmark):
@benchmark
def result():
global mets_50
mets_50 = _build_mets(50, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s50(benchmark):
@benchmark
def ret():
global mets_50
benchmark_find_files(50, mets_50)
del mets_50

mets_c_50 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b50_c(benchmark):
@benchmark
def result():
global mets_c_50
mets_c_50 = _build_mets(50, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s50_c(benchmark):
@benchmark
def ret():
global mets_c_50
benchmark_find_files(50, mets_c_50)
del mets_c_50
# ----------------------------------------------------------------------- #



# ----- 500 pages -> build, search, build (cached), search (cached) ----- #
mets_500 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b500(benchmark):
@benchmark
def result():
global mets_500
mets_500 = _build_mets(500, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s500(benchmark):
@benchmark
def ret():
global mets_500
benchmark_find_files(500, mets_500)
del mets_500


mets_c_500 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b500_c(benchmark):
@benchmark
def result():
global mets_c_500
mets_c_500 = _build_mets(500, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s500_c(benchmark):
@benchmark
def ret():
global mets_c_500
benchmark_find_files(500, mets_c_500)
del mets_c_500

# ----------------------------------------------------------------------- #



# ----- 1000 pages -> build, search, build (cached), search (cached) ----- #
mets_1000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b1000(benchmark):
@benchmark
def result():
global mets_1000
mets_1000 = _build_mets(1000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s1000(benchmark):
@benchmark
def ret():
global mets_1000
benchmark_find_files(1000, mets_1000)
del mets_1000

mets_c_1000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b1000_c(benchmark):
@benchmark
def result():
global mets_c_1000
mets_c_1000 = _build_mets(1000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s1000_c(benchmark):
@benchmark
def ret():
global mets_c_1000
benchmark_find_files(1000, mets_c_1000)
del mets_c_1000

# ------------------------------------------------------------------------ #



# ----- 2000 pages -> build, search, build (cached), search (cached) ----- #
mets_2000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b2000(benchmark):
@benchmark
def result():
global mets_2000
mets_2000 = _build_mets(2000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s2000(benchmark):
@benchmark
def ret():
global mets_2000
benchmark_find_files(2000, mets_2000)
del mets_2000

mets_c_2000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b2000_c(benchmark):
@benchmark
def result():
global mets_c_2000
mets_c_2000 = _build_mets(2000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s2000_c(benchmark):
@benchmark
def ret():
global mets_c_2000
benchmark_find_files(2000, mets_c_2000)
del mets_c_2000

# ------------------------------------------------------------------------ #



# ----- 5000 pages -> build, search, build (cached), search (cached) ----- #
mets_5000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b5000(benchmark):
@benchmark
def result():
global mets_5000
mets_5000 = _build_mets(5000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s5000(benchmark):
@benchmark
def ret():
global mets_5000
benchmark_find_files(5000, mets_5000)
del mets_5000

mets_c_5000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b5000_c(benchmark):
@benchmark
def result():
global mets_c_5000
mets_c_5000 = _build_mets(5000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s5000_c(benchmark):
@benchmark
def ret():
global mets_c_5000
benchmark_find_files(5000, mets_c_5000)
del mets_c_5000

# ------------------------------------------------------------------------ #

if __name__ == '__main__':
args = ['']
# args.append('--benchmark-max-time=10')
# args.append('--benchmark-min-time=0.1')
# args.append('--benchmark-warmup=False')
# args.append('--benchmark-disable-gc')
args.append('--benchmark-verbose')
args.append('--benchmark-min-rounds=1')
args.append('--tb=short')
main(args)
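These benchmarks can be run directly (the __main__ block above invokes pytest with --benchmark-verbose, a minimum of one round per test, and short tracebacks) or by pointing pytest at this file explicitly, since its name does not match the default test_*.py collection pattern; either way they rely on the pytest-benchmark plugin added to requirements_test.txt in this PR. Commit c9e1180 also mentions separate benchmark and benchmark-extreme make targets, though the Makefile change itself is not shown in this view.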