py-pdf · MartinThoma · Feb 26, 2023 · Feb 20, 2023 · Feb 20, 2023 · Feb 20, 2023
diff --git a/pypdf/__init__.py b/pypdf/__init__.py
@@ -12,7 +12,7 @@
 from ._page import PageObject, Transformation
 from ._reader import DocumentInformation, PdfFileReader, PdfReader
 from ._version import __version__
-from ._writer import PdfFileWriter, PdfWriter
+from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
 from .pagerange import PageRange, parse_filename_page_ranges
 from .papersizes import PaperSize
 
@@ -21,6 +21,7 @@
  "PageRange",
  "PaperSize",
  "DocumentInformation",
+ "ObjectDeletionFlag",
  "parse_filename_page_ranges",
  "PdfFileMerger", # will be removed in pypdf==4.0.0; use PdfMerger instead
  "PdfFileReader", # will be removed in pypdf==4.0.0; use PdfReader instead

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -30,6 +30,7 @@
 import codecs
 import collections
 import decimal
+import enum
 import logging
 import random
 import re
@@ -132,6 +133,15 @@
 ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions((2**31 - 1) - 3)
 
 
+class ObjectDeletionFlag(enum.IntFlag):
+ TEXT = enum.auto()
+ IMAGES = enum.auto()
+ LINKS = enum.auto()
+ ATTACHMENTS = enum.auto()
+ OBJECTS_3D = enum.auto()
+ ALL_ANNOTATIONS = enum.auto()
+
+
 class PdfWriter:
  """
  Write a PDF file out, given pages produced by another class.
@@ -1796,12 +1806,8 @@ def addNamedDestination(
 
  def remove_links(self) -> None:
  """Remove links and annotations from this output."""
- pg_dict = cast(DictionaryObject, self.get_object(self._pages))
- pages = cast(ArrayObject, pg_dict[PA.KIDS])
- for page in pages:
- page_ref = cast(DictionaryObject, self.get_object(page))
- if PG.ANNOTS in page_ref:
- del page_ref[PG.ANNOTS]
+ for page in self.pages:
+ self.remove_objects_from_page(page, ObjectDeletionFlag.ALL_ANNOTATIONS)
 
  def removeLinks(self) -> None: # deprecated
  """
@@ -1812,85 +1818,151 @@ def removeLinks(self) -> None: # deprecated
  deprecation_with_replacement("removeLinks", "remove_links", "3.0.0")
  return self.remove_links()
 
+ def remove_annots(self, subtypes: Optional[Union[str, Iterable[str]]]) -> None:
+ """
+ Remove annotations by Subtype
+ args:
+ subtypes : SubType or list of SubTypes to be removed. None=all
+ """
+ for page in self.pages:
+ self._remove_annots_from_page(page, subtypes)
+
+ def _remove_annots_from_page(
+ self,
+ page: Union[IndirectObject, PageObject, DictionaryObject],
+ subtypes: Optional[Iterable[str]],
+ ) -> None:
+ page = cast(DictionaryObject, page.get_object())
+ if PG.ANNOTS in page:
+ i = 0
+ while i < len(cast(ArrayObject, page[PG.ANNOTS])):
+ an = cast(ArrayObject, page[PG.ANNOTS])[i]
+ obj = cast(DictionaryObject, an.get_object())
+ if subtypes is None or cast(str, obj["/Subtype"]) in subtypes:
+ if isinstance(an, IndirectObject):
+ self._objects[an.idnum - 1] = NullObject() # to reduce PDF size
+ del page[PG.ANNOTS][i] # type:ignore
+ else:
+ i += 1
+
+ def remove_objects_from_page(
+ self,
+ page: Union[PageObject, DictionaryObject],
+ to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
+ ) -> None:
+ """
+ Remove objects specified by `to_delete` from the given page.
+
+ Args:
+ page: Page object to clean up
+ to_delete: Objects to be deleted; can be a `ObjectDeletionFlag` or a list of ObjectDeletionFlag
+ """
+ if isinstance(to_delete, (list, tuple)):
+ for to_d in to_delete:
+ self.remove_objects_from_page(page, to_d)
+ return
+ assert isinstance(to_delete, ObjectDeletionFlag)
+
+ if to_delete & ObjectDeletionFlag.LINKS:
+ return self._remove_annots_from_page(page, ("/Link",))
+ if to_delete & ObjectDeletionFlag.ATTACHMENTS:
+ return self._remove_annots_from_page(
+ page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
+ )
+ if to_delete & ObjectDeletionFlag.OBJECTS_3D:
+ return self._remove_annots_from_page(page, ("/3D",))
+ if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
+ return self._remove_annots_from_page(page, None)
+
+ if to_delete & ObjectDeletionFlag.IMAGES:
+ jump_operators = (
+ [b"w", b"J", b"j", b"M", b"d", b"i"]
+ + [b"W", b"W*"]
+ + [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
+ + [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
+ + [b"sh"]
+ )
+ else: # del text
+ jump_operators = [b"Tj", b"TJ", b"'", b'"']
+
+ images = []
+ forms = []
+
+ def clean(content: ContentStream) -> None:
+ nonlocal images, forms, to_delete
+ i = 0
+ while i < len(content.operations):
+ operands, operator = content.operations[i]
+ if operator in jump_operators:
+ del content.operations[i]
+ elif operator == b"Do":
+ if (
+ cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.IMAGES
+ and operands[0] in images
+ or cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.TEXT
+ and operands[0] in forms
+ ):
+ del content.operations[i]
+ i += 1
+ else:
+ i += 1
+
+ try:
+ d = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
+ except KeyError:
+ d = {}
+ for k, v in d.items():
+ o = v.get_object()
+ try:
+ content: Any = None
+ if to_delete & ObjectDeletionFlag.IMAGES and o["/Subtype"] == "/Image":
+ content = NullObject()
+ images.append(k)
+ if o["/Subtype"] == "/Form":
+ forms.append(k)
+ if isinstance(o, ContentStream):
+ content = o
+ else:
+ content = ContentStream(o, self)
+ content.update(o.items())
+ for k1 in ["/Length", "/Filter", "/DecodeParms"]:
+ try:
+ del content[k1]
+ except KeyError:
+ pass
+ clean(content)
+ if content is not None:
+ if isinstance(v, IndirectObject):
+ self._objects[v.idnum - 1] = content
+ else:
+ d[k] = self._add_object(content)
+ except (TypeError, KeyError):
+ pass
+ if "/Contents" in page:
+ content = page["/Contents"].get_object()
+ if not isinstance(content, ContentStream):
+ content = ContentStream(content, page)
+ clean(cast(ContentStream, content))
+ if isinstance(page["/Contents"], ArrayObject):
+ for o in cast(ArrayObject, page["/Contents"]):
+ self._objects[o.idnum - 1] = NullObject()
+ try:
+ self._objects[
+ cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
+ ] = NullObject()
+ except AttributeError:
+ pass
+ page[NameObject("/Contents")] = self._add_object(content)
+
  def remove_images(self, ignore_byte_string_object: bool = False) -> None:
  """
  Remove images from this output.
 
  Args:
- ignore_byte_string_object: optional parameter
- to ignore ByteString Objects.
- """
- pg_dict = cast(DictionaryObject, self.get_object(self._pages))
- pages = cast(ArrayObject, pg_dict[PA.KIDS])
- jump_operators = (
- b"cm",
- b"w",
- b"J",
- b"j",
- b"M",
- b"d",
- b"ri",
- b"i",
- b"gs",
- b"W",
- b"b",
- b"s",
- b"S",
- b"f",
- b"F",
- b"n",
- b"m",
- b"l",
- b"c",
- b"v",
- b"y",
- b"h",
- b"B",
- b"Do",
- b"sh",
- )
- for page in pages:
- page_ref = cast(DictionaryObject, self.get_object(page))
- if "/Contents" not in page_ref:
- return
- content = page_ref["/Contents"].get_object()
- if not isinstance(content, ContentStream):
- content = ContentStream(content, page_ref)
-
- _operations = []
- seq_graphics = False
- for operands, operator in content.operations:
- if operator in [b"Tj", b"'"]:
- text = operands[0]
- if ignore_byte_string_object and not isinstance(
- text, TextStringObject
- ):
- operands[0] = TextStringObject()
- elif operator == b'"':
- text = operands[2]
- if ignore_byte_string_object and not isinstance(
- text, TextStringObject
- ):
- operands[2] = TextStringObject()
- elif operator == b"TJ":
- for i in range(len(operands[0])):
- if ignore_byte_string_object and not isinstance(
- operands[0][i], TextStringObject
- ):
- operands[0][i] = TextStringObject()
-
- if operator == b"q":
- seq_graphics = True
- if operator == b"Q":
- seq_graphics = False
- if seq_graphics and operator in jump_operators:
- continue
- if operator == b"re":
- continue
- _operations.append((operands, operator))
-
- content.operations = _operations
- page_ref.__setitem__(NameObject("/Contents"), content)
+ ignore_byte_string_object: obsolete
+ """
+ for page in self.pages:
+ self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES)
 
  def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
  """
@@ -1906,44 +1978,10 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
  Remove text from this output.
 
  Args:
- ignore_byte_string_object: optional parameter
+ ignore_byte_string_object: obsolete
  """
- pg_dict = cast(DictionaryObject, self.get_object(self._pages))
- pages = cast(List[IndirectObject], pg_dict[PA.KIDS])
- for page in pages:
- page_ref = cast(PageObject, self.get_object(page))
- content = page_ref["/Contents"].get_object()
- if not isinstance(content, ContentStream):
- content = ContentStream(content, page_ref)
- for operands, operator in content.operations:
- if operator in [b"Tj", b"'"]:
- text = operands[0]
- if not ignore_byte_string_object:
- if isinstance(text, TextStringObject):
- operands[0] = TextStringObject()
- else:
- if isinstance(text, (TextStringObject, ByteStringObject)):
- operands[0] = TextStringObject()
- elif operator == b'"':
- text = operands[2]
- if not ignore_byte_string_object:
- if isinstance(text, TextStringObject):
- operands[2] = TextStringObject()
- else:
- if isinstance(text, (TextStringObject, ByteStringObject)):
- operands[2] = TextStringObject()
- elif operator == b"TJ":
- for i in range(len(operands[0])):
- if not ignore_byte_string_object:
- if isinstance(operands[0][i], TextStringObject):
- operands[0][i] = TextStringObject()
- else:
- if isinstance(
- operands[0][i], (TextStringObject, ByteStringObject)
- ):
- operands[0][i] = TextStringObject()
-
- page_ref.__setitem__(NameObject("/Contents"), content)
+ for page in self.pages:
+ self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)
 
  def removeText(self, ignoreByteStringObject: bool = False) -> None: # deprecated
  """