ROB: Fix errors/warnings on no /Resources within extract_text (#1276)

Look for /Resources in parents. Closes #1272 (in text). Closes #1269 (in XForm).
py-pdf · Aug 28, 2022 · af9c01b · af9c01b
1 parent ceb997d
commit af9c01b
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 2 deletions.
diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1140,7 +1140,15 @@ def _extract_text(
  cmaps: Dict[
  str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]]
  ] = {}
- resources_dict = cast(DictionaryObject, obj["/Resources"])
+ try:
+ objr = obj
+ while NameObject("/Resources") not in objr:
+ # /Resources can be inherited sometimes so we look to parents
+ objr = objr["/Parent"].get_object()
+ # if there are no more parents, no /Resources is available => an exception will be raised
+ resources_dict = cast(DictionaryObject, objr["/Resources"])
+ except Exception:
+ return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj
  if "/Font" in resources_dict:
  for f in cast(DictionaryObject, resources_dict["/Font"]):
  cmaps[f] = build_char_map(f, space_width, obj)

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -238,6 +238,13 @@ def test_extract_text_single_quote_op():
  page.extract_text()
 
 
+def test_no_ressources_on_text_extract():
+ url = "https:/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf"
+ reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf")))
+ for page in reader.pages:
+ page.extract_text()
+
+
 def test_iss_1142():
  # check fix for problem of context save/restore (q/Q)
  url = "https:/py-pdf/PyPDF2/files/9150656/ST.2019.PDF"
@@ -285,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog):
  for page in reader.pages:
  page.extract_text()
  warn_msgs = normalize_warnings(caplog.text)
- assert warn_msgs == [" impossible to decode XFormObject /Meta203"]
+ assert warn_msgs == [""] # text extraction recognise no text
 
 
 def test_extract_text_operator_t_star(): # L1266, L1267