Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Tolerate PDF with invalid xref pointed objects #2335

Merged
merged 13 commits into from
Mar 30, 2024
Merged
19 changes: 19 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,25 @@ def read(self, stream: StreamType) -> None:
# non-zero-index is actually correct
stream.seek(loc, 0) # return to where it was

# remove xref entries whose offset does not point to a valid object header - cf #2326
if not self.strict:
loc = stream.tell()
for gen, xref_entry in self.xref.items():
if gen == 65535:
continue
ids = list(xref_entry.keys())
for id in ids:
stream.seek(xref_entry[id], 0)
try:
self.read_object_header(stream)
except ValueError:
logger_warning(
f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})",
__name__,
)
del xref_entry[id] # we can delete the id, we are parsing ids
stream.seek(loc, 0) # return to where it was

def _basic_validation(self, stream: StreamType) -> None:
"""Ensure file is not empty. Read at most 5 bytes."""
stream.seek(0, os.SEEK_SET)
Expand Down
25 changes: 14 additions & 11 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ def test_get_images(src, expected_images):
False,
-1,
False,
["startxref on same line as offset"],
[
"startxref on same line as offset",
],
),
(
False,
Expand Down Expand Up @@ -322,11 +324,12 @@ def test_get_images_raw(
b"%%%%EOF"
)
pdf_data = pdf_data % (
pdf_data.find(b"1 0 obj"),
pdf_data.find(b"2 0 obj"),
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
# subtract 1 from each find() result: the double % at the beginning of the template shifts the offsets by one
pdf_data.find(b"1 0 obj") - 1,
pdf_data.find(b"2 0 obj") - 1,
pdf_data.find(b"3 0 obj") - 1,
pdf_data.find(b"4 0 obj") - 1,
pdf_data.find(b"5 0 obj") - 1,
b"/Prev 0 " if with_prev_0 else b"",
# startx_correction should be -1 due to double % at the beginning
# inducing an error on startxref computation
Expand Down Expand Up @@ -593,11 +596,11 @@ def test_read_unknown_zero_pages(caplog):
b"%%%%EOF"
)
pdf_data = pdf_data % (
pdf_data.find(b"1 0 obj"),
pdf_data.find(b"2 0 obj"),
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
pdf_data.find(b"1 0 obj") - 1,
pdf_data.find(b"2 0 obj") - 1,
pdf_data.find(b"3 0 obj") - 1,
pdf_data.find(b"4 0 obj") - 1,
pdf_data.find(b"5 0 obj") - 1,
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
Expand Down
6 changes: 5 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name):
retrieved_fields = reader.get_fields(fileobj=fp)

assert retrieved_fields == {}
assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."]
assert normalize_warnings(caplog.text) == [
"Ignoring wrong pointing object 1 65536 (offset 0)",
"Ignoring wrong pointing object 2 65536 (offset 0)",
"Object 2 0 not defined.",
]


@pytest.mark.enable_socket()
Expand Down
Loading