Skip to content

Commit

Permalink
ROB: Rebuild xref table if one entry is invalid (#2528)
Browse files Browse the repository at this point in the history
Fixes #2523

Situations encountered:
* the length field is not correct
* the xref may contain unordered stream data
* the xref contains some free entries (i.e. entries that do not contain a stream offset)
  • Loading branch information
pubpub-zz authored Mar 24, 2024
1 parent c4641d1 commit f8edf3c
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
6 changes: 6 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,11 @@ def get_object(
self.stream.seek(start, 0)
try:
idnum, generation = self.read_object_header(self.stream)
if (
idnum != indirect_reference.idnum
or generation != indirect_reference.generation
):
raise PdfReadError("not matching, we parse the file for it")
except Exception:
if hasattr(self.stream, "getbuffer"):
buf = bytes(self.stream.getbuffer())
Expand Down Expand Up @@ -1452,6 +1457,7 @@ def read(self, stream: StreamType) -> None:
try:
pid, _pgen = self.read_object_header(stream)
except ValueError:
self._rebuild_xref_table(stream)
break
if pid == id - self.xref_index:
# fixing index item per item is required for revised PDF.
Expand Down
10 changes: 8 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,8 +1290,6 @@ def test_reader(caplog):
caplog.clear()
# first call requires some reparations...
reader.pages[0].extract_text()
assert "repaired" in caplog.text
assert "found" in caplog.text
caplog.clear()
# ...and now no more required
reader.pages[0].extract_text()
Expand Down Expand Up @@ -1498,3 +1496,11 @@ def test_xyz_with_missing_param():
assert reader.outline[0]["/Top"] == 0
assert reader.outline[1]["/Left"] == 0
assert reader.outline[0]["/Top"] == 0


@pytest.mark.enable_socket()
def test_corrupted_xref():
    """Check that a PDF with a corrupted xref table can still be opened.

    The sample file ``iss2516.pdf`` is downloaded (network access is
    required, hence the ``enable_socket`` marker), loaded into a
    ``PdfReader`` from memory, and the test passes when the root object
    reports ``/Type`` equal to ``/Catalog``.
    """
    # Full GitHub attachment URL: scheme, host, then owner/repo/files/<id>/<name>.
    url = "https://github.com/py-pdf/pypdf/files/14628314/iss2516.pdf"
    name = "iss2516.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Reaching the catalog shows the reader could resolve objects for this file.
    assert reader.root_object["/Type"] == "/Catalog"

0 comments on commit f8edf3c

Please sign in to comment.