TST: writer.remove_text (#946)

py-pdf · Jun 5, 2022 · 1df859c · 1df859c
1 parent 34919f9
commit 1df859c
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 4 deletions.
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py
@@ -1292,7 +1292,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
  """
  Remove text from this output.
 
- :param bool ignoreByteStringObject: optional parameter
+ :param bool ignore_byte_string_object: optional parameter
  to ignore ByteString Objects.
  """
  pg_dict = cast(DictionaryObject, self.get_object(self._pages))

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -1097,10 +1097,13 @@ def __init__(self, stream: Any, pdf: Any) -> None:
  data = b_("")
  for s in stream:
  data += b_(s.get_object().get_data())
- stream = BytesIO(b_(data))
+ stream_bytes = BytesIO(b_(data))
  else:
- stream = BytesIO(b_(stream.get_data()))
- self.__parseContentStream(stream)
+ stream_data = stream.get_data()
+ assert stream_data is not None
+ stream_data_bytes = b_(stream_data)
+ stream_bytes = BytesIO(stream_data_bytes)
+ self.__parseContentStream(stream_bytes)
 
  def __parseContentStream(self, stream: StreamType) -> None:
  # file("f:\\tmp.txt", "w").write(stream.read())

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -132,6 +132,76 @@ def test_remove_text(input_path, ignore_byte_string_object):
  os.remove(tmp_filename)
 
 
+@pytest.mark.parametrize(
+ ("ignore_byte_string_object"),
+ [False, True],
+)
+def test_remove_text_all_operators(ignore_byte_string_object):
+ stream = (
+ b"BT "
+ b"/F0 36 Tf "
+ b"50 706 Td "
+ b"36 TL "
+ b"(The Tj operator) Tj "
+ b'1 2 (The double quote operator) " '
+ b"(The single quote operator) ' "
+ b"ET"
+ )
+ pdf_data = (
+ b"%%PDF-1.7\n"
+ b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n"
+ b"2 0 obj << >> endobj\n"
+ b"3 0 obj << >> endobj\n"
+ b"4 0 obj << /Length %d >>\n"
+ b"stream\n" + (b"%s\n" % stream) + b"endstream\n"
+ b"endobj\n"
+ b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n"
+ b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+ b" /Resources << /Font << >> >>"
+ b" /Rotate 0 /Type /Page >> endobj\n"
+ b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+ b"xref 1 6\n"
+ b"%010d 00000 n\n"
+ b"%010d 00000 n\n"
+ b"%010d 00000 n\n"
+ b"%010d 00000 n\n"
+ b"%010d 00000 n\n"
+ b"%010d 00000 n\n"
+ b"trailer << /Root 6 0 R /Size 6 >>\n"
+ b"startxref\n%d\n"
+ b"%%%%EOF"
+ )
+ startx_correction = -1
+ pdf_data = pdf_data % (
+ len(stream),
+ pdf_data.find(b"1 0 obj") + startx_correction,
+ pdf_data.find(b"2 0 obj") + startx_correction,
+ pdf_data.find(b"3 0 obj") + startx_correction,
+ pdf_data.find(b"4 0 obj") + startx_correction,
+ pdf_data.find(b"5 0 obj") + startx_correction,
+ pdf_data.find(b"6 0 obj") + startx_correction,
+ # startx_correction should be -1 due to the double % at the beginning inducing an error on the startxref computation
+ pdf_data.find(b"xref"),
+ )
+ print(pdf_data.decode())
+ pdf_stream = BytesIO(pdf_data)
+
+ reader = PdfReader(pdf_stream, strict=False)
+ writer = PdfWriter()
+
+ page = reader.pages[0]
+ writer.insert_page(page, 0)
+ writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)
+
+ # finally, write "output" to a temporary file
+ tmp_filename = "dont_commit_writer_removed_text.pdf"
+ with open(tmp_filename, "wb") as output_stream:
+ writer.write(output_stream)
+
+ # Cleanup
+ os.remove(tmp_filename)
+
+
 def test_write_metadata():
  pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")