Skip to content

Commit

Permalink
ROB: Fix errors/warnings on no /Resources within extract_text (#1276)
Browse files Browse the repository at this point in the history
Look for /Resources in parents

Closes  #1272 (in text)
Closes #1269 (in Xform)
  • Loading branch information
pubpub-zz authored Aug 28, 2022
1 parent ceb997d commit af9c01b
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
10 changes: 9 additions & 1 deletion PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,7 +1140,15 @@ def _extract_text(
cmaps: Dict[
str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]]
] = {}
resources_dict = cast(DictionaryObject, obj["/Resources"])
try:
objr = obj
while NameObject("/Resources") not in objr:
# /Resources can be inherited sometimes so we look to parents
objr = objr["/Parent"].get_object()
# if there are no more parents, no /Resources is available => an exception will be raised
resources_dict = cast(DictionaryObject, objr["/Resources"])
except Exception:
return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj
if "/Font" in resources_dict:
for f in cast(DictionaryObject, resources_dict["/Font"]):
cmaps[f] = build_char_map(f, space_width, obj)
Expand Down
9 changes: 8 additions & 1 deletion tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,13 @@ def test_extract_text_single_quote_op():
page.extract_text()


def test_no_ressources_on_text_extract():
    """Extract text from every page of a sample PDF without raising.

    Regression test named for the "no /Resources" case: the test passes as
    long as ``extract_text()`` completes on each page; the extracted text
    itself is not checked.
    """
    # Full attachment URL (scheme + host); a URL without the host cannot be
    # downloaded.
    url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf"
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf")))
    for page in reader.pages:
        page.extract_text()


def test_iss_1142():
# check fix for problem of context save/restore (q/Q)
url = "https:/py-pdf/PyPDF2/files/9150656/ST.2019.PDF"
Expand Down Expand Up @@ -285,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog):
for page in reader.pages:
page.extract_text()
warn_msgs = normalize_warnings(caplog.text)
assert warn_msgs == [" impossible to decode XFormObject /Meta203"]
assert warn_msgs == [""] # text extraction recognise no text


def test_extract_text_operator_t_star(): # L1266, L1267
Expand Down

0 comments on commit af9c01b

Please sign in to comment.