From 7621d8e6d6c926d1961a66af6255aa02f5f6c1cb Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 7 Dec 2023 09:49:32 +0100 Subject: [PATCH 1/8] BUG: Relax flate decoding for too many lookup values --- pypdf/_xobj_image_helpers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 515c01ebe..577979db3 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -195,7 +195,14 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). - assert len(lookup) == 2 * nb, len(lookup) + if len(lookup) != 2 * nb: + if len(lookup) < 2 * nb: + raise PdfReadError(f"Not enough lookup values: Expected {2 * nb}, got {len(lookup)}.") + logger_warning( + f"Expected {2 * nb} lookup values, got {len(lookup)}. Ignoring trailing ones.", + __name__, + ) + lookup = lookup[:2 * nb] colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ From 676abc891069c1f550ae037e9c8886efba8fee2c Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:52:35 +0100 Subject: [PATCH 2/8] only accept whitespace characters as trailing ones --- pypdf/_xobj_image_helpers.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 577979db3..30b69aa9d 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning +from ._utils import logger_warning, WHITESPACES from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( @@ -195,14 +195,13 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). 
- if len(lookup) != 2 * nb: - if len(lookup) < 2 * nb: - raise PdfReadError(f"Not enough lookup values: Expected {2 * nb}, got {len(lookup)}.") - logger_warning( - f"Expected {2 * nb} lookup values, got {len(lookup)}. Ignoring trailing ones.", - __name__, - ) - lookup = lookup[:2 * nb] + expected_count = 2 * nb + if len(lookup) != expected_count: + if len(lookup) < expected_count: + raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.") + lookup = lookup[:expected_count] + if not all(_value in WHITESPACES for _value in lookup[expected_count:]): + raise PdfReadError(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.") colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ From c34e9fc5f68ff65d2aa941c377a78ecce3666762 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:55:21 +0100 Subject: [PATCH 3/8] fix import sort order --- pypdf/_xobj_image_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 30b69aa9d..a390357dd 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning, WHITESPACES +from ._utils import WHITESPACES, logger_warning from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( From 73308dc17659d02f4b0ea25c5a8c3204c6960db2 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:58:16 +0100 Subject: [PATCH 4/8] add test with trailing newline --- tests/test_filters.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_filters.py b/tests/test_filters.py index 00a548ab0..72d6f76cc 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -649,3 +649,12 @@ def 
test_flate_decode_with_image_mode_1(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for image in reader.pages[7].images: _ = image + + +@pytest.mark.enable_socket() +def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): + """From #2331""" + url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf" + name = "issue2331.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images[0] From 52fdd5ca7ea5d4ab098fd57e3bbed984bbf60d88 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 9 Dec 2023 13:45:36 +0100 Subject: [PATCH 5/8] ENH : tolerate missing EOD at end of RunLengthDecode --- pypdf/filters.py | 6 ++++-- tests/test_filters.py | 16 ++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index d1c06a341..499a03576 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -285,7 +285,8 @@ def decode( index = 0 while True: if index >= len(data): - raise PdfStreamError("Unexpected EOD in ASCIIHexDecode") + break # reach End Of String even if no EOD + # raise PdfStreamError("Unexpected EOD in ASCIIHexDecode") char = data[index : index + 1] if char == b">": break @@ -340,7 +341,8 @@ def decode( index = 0 while True: if index >= len(data): - raise PdfStreamError("Unexpected EOD in RunLengthDecode") + break # reach End Of String even if no EOD + # raise PdfStreamError("Unexpected EOD in RunLengthDecode") length = data[index] index += 1 if length == 128: diff --git a/tests/test_filters.py b/tests/test_filters.py index 72d6f76cc..54e8b45e2 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -11,7 +11,7 @@ from PIL import Image from pypdf import PdfReader -from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError +from pypdf.errors import DeprecationError, PdfReadError from pypdf.filters import ( ASCII85Decode, ASCIIHexDecode, @@ -131,9 +131,9 @@ def 
test_ascii_hex_decode_method(data, expected): def test_ascii_hex_decode_missing_eod(): """ASCIIHexDecode.decode() raises error when no EOD character is present.""" - with pytest.raises(PdfStreamError) as exc: - ASCIIHexDecode.decode("") - assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode" + # with pytest.raises(PdfStreamError) as exc: + ASCIIHexDecode.decode("") + # assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode" @pytest.mark.enable_socket() @@ -561,14 +561,10 @@ def test_runlengthdecode(): url = "https://github.com/py-pdf/pypdf/files/12162905/out.pdf" name = "FailedRLE1.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - with pytest.raises(PdfStreamError) as exc: - reader.pages[0].images[0] - assert exc.value.args[0] == "Unexpected EOD in RunLengthDecode" + reader.pages[0].images[0] url = "https://github.com/py-pdf/pypdf/files/12162926/out.pdf" name = "FailedRLE2.pdf" - with pytest.raises(PdfStreamError) as exc: - reader.pages[0].images[0] - assert exc.value.args[0] == "Unexpected EOD in RunLengthDecode" + reader.pages[0].images[0] @pytest.mark.enable_socket() From f23b96347377886229398020df2ebfd564c84554 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 9 Dec 2023 13:55:12 +0100 Subject: [PATCH 6/8] ENH : tolerate pdf with invalid xref pointed objects closes#2326 --- pypdf/_reader.py | 19 +++++++++++++++++++ pypdf/_writer.py | 2 +- tests/test_filters.py | 2 +- tests/test_reader.py | 27 +++++++++++++++------------ tests/test_workflows.py | 6 +++++- 5 files changed, 41 insertions(+), 15 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 73af2dc35..5061885b0 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1588,6 +1588,25 @@ def read(self, stream: StreamType) -> None: # non-zero-index is actually correct stream.seek(loc, 0) # return to where it was + # remove wrong objects( not pointing to correct structures) - cf #2326 + if not self.strict: + loc = 
stream.tell() + for gen, xref_entry in self.xref.items(): + if gen == 65535: + continue + ids = list(xref_entry.keys()) + for id in ids: + stream.seek(xref_entry[id], 0) + try: + self.read_object_header(stream) + except ValueError: + logger_warning( + f"ignore wrong pointing object {id} {gen} (offset {xref_entry[id]})", + __name__, + ) + del xref_entry[id] # we can delete the id, we are parsing ids + stream.seek(loc, 0) # return to where it was + def _basic_validation(self, stream: StreamType) -> None: """Ensure file is not empty. Read at most 5 bytes.""" stream.seek(0, os.SEEK_SET) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e4db6e32e..0e9cc359b 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -3312,7 +3312,7 @@ def _get_filtered_outline( if node is None: node = NullObject() node = node.get_object() - if isinstance(node, NullObject): + if isinstance(node, NullObject) or node is None: node = DictionaryObject() if node.get("/Type", "") == "/Outlines" or "/Title" not in node: node = node.get("/First", None) diff --git a/tests/test_filters.py b/tests/test_filters.py index 54e8b45e2..871331da2 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -561,7 +561,7 @@ def test_runlengthdecode(): url = "https://github.com/py-pdf/pypdf/files/12162905/out.pdf" name = "FailedRLE1.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - reader.pages[0].images[0] + reader.pages[0].images[0] # now works url = "https://github.com/py-pdf/pypdf/files/12162926/out.pdf" name = "FailedRLE2.pdf" reader.pages[0].images[0] diff --git a/tests/test_reader.py b/tests/test_reader.py index b252e48f9..7688f3281 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -265,7 +265,9 @@ def test_get_images(src, expected_images): False, -1, False, - ["startxref on same line as offset"], + [ + "startxref on same line as offset", + ], ), ( False, @@ -319,11 +321,12 @@ def test_get_images_raw( b"%%%%EOF" ) pdf_data = pdf_data % ( - pdf_data.find(b"1 0 
obj"), - pdf_data.find(b"2 0 obj"), - pdf_data.find(b"3 0 obj"), - pdf_data.find(b"4 0 obj"), - pdf_data.find(b"5 0 obj"), + # - 1 below in the find because of the double % + pdf_data.find(b"1 0 obj") - 1, + pdf_data.find(b"2 0 obj") - 1, + pdf_data.find(b"3 0 obj") - 1, + pdf_data.find(b"4 0 obj") - 1, + pdf_data.find(b"5 0 obj") - 1, b"/Prev 0 " if with_prev_0 else b"", # startx_correction should be -1 due to double % at the beginning # inducing an error on startxref computation @@ -589,11 +592,11 @@ def test_read_unknown_zero_pages(caplog): b"%%%%EOF" ) pdf_data = pdf_data % ( - pdf_data.find(b"1 0 obj"), - pdf_data.find(b"2 0 obj"), - pdf_data.find(b"3 0 obj"), - pdf_data.find(b"4 0 obj"), - pdf_data.find(b"5 0 obj"), + pdf_data.find(b"1 0 obj") - 1, + pdf_data.find(b"2 0 obj") - 1, + pdf_data.find(b"3 0 obj") - 1, + pdf_data.find(b"4 0 obj") - 1, + pdf_data.find(b"5 0 obj") - 1, pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) @@ -1258,7 +1261,7 @@ def test_reader(caplog): caplog.clear() # first call requires some reparations... 
reader.pages[0].extract_text() - assert "repaired" in caplog.text + # no more true: assert "repaired" in caplog.text assert "found" in caplog.text caplog.clear() # ...and now no more required diff --git a/tests/test_workflows.py b/tests/test_workflows.py index de43fe7a8..6e6ef3cad 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -545,7 +545,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name): retrieved_fields = reader.get_fields(fileobj=fp) assert retrieved_fields == {} - assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."] + assert normalize_warnings(caplog.text) == [ + "ignore wrong pointing object 1 65536 (offset 0)", + "ignore wrong pointing object 2 65536 (offset 0)", + "Object 2 0 not defined.", + ] @pytest.mark.enable_socket() From 46c1387b66c0196a2ecf8979015cda5b7a19eca4 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sat, 30 Mar 2024 09:45:30 +0100 Subject: [PATCH 7/8] improve messages/comments --- pypdf/_reader.py | 4 ++-- tests/test_workflows.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index a48c8883f..54fb33f1b 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -573,7 +573,7 @@ def read(self, stream: StreamType) -> None: # non-zero-index is actually correct stream.seek(loc, 0) # return to where it was - # remove wrong objects( not pointing to correct structures) - cf #2326 + # remove wrong objects (not pointing to correct structures) - cf #2326 if not self.strict: loc = stream.tell() for gen, xref_entry in self.xref.items(): @@ -586,7 +586,7 @@ def read(self, stream: StreamType) -> None: self.read_object_header(stream) except ValueError: logger_warning( - f"ignore wrong pointing object {id} {gen} (offset {xref_entry[id]})", + f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})", __name__, ) del xref_entry[id] # we can delete the id, we are parsing ids diff --git 
a/tests/test_workflows.py b/tests/test_workflows.py index 66264a610..cc8d5f94e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -548,8 +548,8 @@ def test_get_fields_warns(tmp_path, caplog, url, name): assert retrieved_fields == {} assert normalize_warnings(caplog.text) == [ - "ignore wrong pointing object 1 65536 (offset 0)", - "ignore wrong pointing object 2 65536 (offset 0)", + "Ignoring wrong pointing object 1 65536 (offset 0)", + "Ignoring wrong pointing object 2 65536 (offset 0)", "Object 2 0 not defined.", ] From 550aa6eecf69416a753e2592ded112545a3b95b0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 30 Mar 2024 09:52:34 +0100 Subject: [PATCH 8/8] error in merge --- tests/test_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 52e9e15e1..f8be72dc1 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1294,7 +1294,6 @@ def test_reader(caplog): caplog.clear() # first call requires some reparations... reader.pages[0].extract_text() - assert "found" in caplog.text caplog.clear() # ...and now no more required reader.pages[0].extract_text()