From 3a6e4d0d42351fdea8e7ce0efe814684bd9b7497 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 30 Mar 2024 10:00:06 +0100 Subject: [PATCH] ENH: Tolerate PDF with invalid xref pointed objects (#2335) Closes #2326 --- pypdf/_reader.py | 19 +++++++++++++++++++ tests/test_reader.py | 25 ++++++++++++++----------- tests/test_workflows.py | 6 +++++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index eabdc52f1..54fb33f1b 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -573,6 +573,25 @@ def read(self, stream: StreamType) -> None: # non-zero-index is actually correct stream.seek(loc, 0) # return to where it was + # remove wrong objects (not pointing to correct structures) - cf #2326 + if not self.strict: + loc = stream.tell() + for gen, xref_entry in self.xref.items(): + if gen == 65535: + continue + ids = list(xref_entry.keys()) + for id in ids: + stream.seek(xref_entry[id], 0) + try: + self.read_object_header(stream) + except ValueError: + logger_warning( + f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})", + __name__, + ) + del xref_entry[id] # we can delete the id, we are parsing ids + stream.seek(loc, 0) # return to where it was + def _basic_validation(self, stream: StreamType) -> None: """Ensure file is not empty. Read at most 5 bytes.""" stream.seek(0, os.SEEK_SET) diff --git a/tests/test_reader.py b/tests/test_reader.py index 20cb7dd9d..f8be72dc1 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -268,7 +268,9 @@ def test_get_images(src, expected_images): False, -1, False, - ["startxref on same line as offset"], + [ + "startxref on same line as offset", + ], ), ( False, @@ -322,11 +324,12 @@ def test_get_images_raw( b"%%%%EOF" ) pdf_data = pdf_data % ( - pdf_data.find(b"1 0 obj"), - pdf_data.find(b"2 0 obj"), - pdf_data.find(b"3 0 obj"), - pdf_data.find(b"4 0 obj"), - pdf_data.find(b"5 0 obj"), + # - 1 below in the find because of the double % + pdf_data.find(b"1 0 obj") - 1, + pdf_data.find(b"2 0 obj") - 1, + pdf_data.find(b"3 0 obj") - 1, + pdf_data.find(b"4 0 obj") - 1, + pdf_data.find(b"5 0 obj") - 1, b"/Prev 0 " if with_prev_0 else b"", # startx_correction should be -1 due to double % at the beginning # inducing an error on startxref computation @@ -593,11 +596,11 @@ def test_read_unknown_zero_pages(caplog): b"%%%%EOF" ) pdf_data = pdf_data % ( - pdf_data.find(b"1 0 obj"), - pdf_data.find(b"2 0 obj"), - pdf_data.find(b"3 0 obj"), - pdf_data.find(b"4 0 obj"), - pdf_data.find(b"5 0 obj"), + pdf_data.find(b"1 0 obj") - 1, + pdf_data.find(b"2 0 obj") - 1, + pdf_data.find(b"3 0 obj") - 1, + pdf_data.find(b"4 0 obj") - 1, + pdf_data.find(b"5 0 obj") - 1, pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 1a31cddad..cc8d5f94e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -547,7 +547,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name): retrieved_fields = reader.get_fields(fileobj=fp) assert retrieved_fields == {} - assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."] + assert normalize_warnings(caplog.text) == [ + "Ignoring wrong pointing object 1 65536 (offset 0)", + "Ignoring wrong pointing object 2 65536 (offset 0)", + "Object 2 0 not defined.", + ] @pytest.mark.enable_socket()