Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add PdfWriter.remove_objects_from_page(page: PageObject, to_delete: ObjectDeletionFlag) #1648

Merged
merged 35 commits into from
Feb 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d27adda
ROB : Remove Text not working in all cases
pubpub-zz Feb 20, 2023
56e8b52
wipe text within XObjects
pubpub-zz Feb 20, 2023
52a6329
mypy
pubpub-zz Feb 20, 2023
3e2211b
fix remove_image
pubpub-zz Feb 21, 2023
aa22eda
pytest
pubpub-zz Feb 21, 2023
9d0d566
ROB + TST
pubpub-zz Feb 22, 2023
3649581
mypy
pubpub-zz Feb 22, 2023
f4b663c
rename func
pubpub-zz Feb 25, 2023
c42fa31
Merge remote-tracking branch 'py-pdf/main' into remove_text
pubpub-zz Feb 25, 2023
0928c8d
Merge remote-tracking branch 'py-pdf/main' into remove_text
pubpub-zz Feb 25, 2023
23b9146
Merge remote-tracking branch 'py-pdf/main' into remove_text
pubpub-zz Feb 25, 2023
3ed6653
Revert "Merge remote-tracking branch 'py-pdf/main' into remove_text"
pubpub-zz Feb 25, 2023
222c785
extensions in accordance with comments
pubpub-zz Feb 25, 2023
fe7fa62
fix
pubpub-zz Feb 25, 2023
c96c327
add test
pubpub-zz Feb 25, 2023
0b388b8
mypy
pubpub-zz Feb 25, 2023
10be747
fix loop
pubpub-zz Feb 25, 2023
49aae98
Merge remote-tracking branch 'py-pdf/main' into remove_text
pubpub-zz Feb 25, 2023
d30ba30
pytest
pubpub-zz Feb 25, 2023
2835c90
attempt
pubpub-zz Feb 26, 2023
b7095be
attempt 2
pubpub-zz Feb 26, 2023
4721c9b
Merge branch 'main' into remove_text
pubpub-zz Feb 26, 2023
a89cbca
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
3f47026
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
e8c2081
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
f0a357a
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
1414cc5
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
1ef07f8
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
393838f
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
663897e
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
4459f46
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
8129713
Update pypdf/_writer.py
pubpub-zz Feb 26, 2023
ab2e506
fix enum calls
pubpub-zz Feb 26, 2023
48899c7
pytest
pubpub-zz Feb 26, 2023
b70986a
mypy
pubpub-zz Feb 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ._page import PageObject, Transformation
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import PdfFileWriter, PdfWriter
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
from .pagerange import PageRange, parse_filename_page_ranges
from .papersizes import PaperSize

Expand All @@ -21,6 +21,7 @@
"PageRange",
"PaperSize",
"DocumentInformation",
"ObjectDeletionFlag",
"parse_filename_page_ranges",
"PdfFileMerger", # will be removed in pypdf==4.0.0; use PdfMerger instead
"PdfFileReader", # will be removed in pypdf==4.0.0; use PdfReader instead
Expand Down
272 changes: 155 additions & 117 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import codecs
import collections
import decimal
import enum
import logging
import random
import re
Expand Down Expand Up @@ -132,6 +133,15 @@
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions((2**31 - 1) - 3)


class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming the kinds of objects that can be deleted from a page.

    Being an ``IntFlag``, members can be combined with ``|`` and tested
    with ``&``.
    """

    # Text-showing operators in content streams.
    TEXT = 1 << 0
    # Image XObjects and path/painting operators in content streams.
    IMAGES = 1 << 1
    # Annotations whose subtype is "/Link".
    LINKS = 1 << 2
    # "/FileAttachment", "/Sound", "/Movie" and "/Screen" annotations.
    ATTACHMENTS = 1 << 3
    # Annotations whose subtype is "/3D".
    OBJECTS_3D = 1 << 4
    # Every annotation, whatever its subtype.
    ALL_ANNOTATIONS = 1 << 5


class PdfWriter:
"""
Write a PDF file out, given pages produced by another class.
Expand Down Expand Up @@ -1796,12 +1806,8 @@ def addNamedDestination(

def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page of the writer is passed to ``remove_objects_from_page`` with
    ``ObjectDeletionFlag.ALL_ANNOTATIONS``.
    """
    # Delegate per page instead of deleting the /Annots entry directly, so
    # the annotation handling lives in one place.
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.ALL_ANNOTATIONS)

def removeLinks(self) -> None: # deprecated
"""
Expand All @@ -1812,85 +1818,151 @@ def removeLinks(self) -> None: # deprecated
deprecation_with_replacement("removeLinks", "remove_links", "3.0.0")
return self.remove_links()

def remove_annots(self, subtypes: Optional[Union[str, Iterable[str]]]) -> None:
    """
    Remove annotations by Subtype from every page of this output.

    Args:
        subtypes: Subtype name (e.g. ``"/Link"``) or iterable of subtype
            names to be removed. ``None`` removes all annotations.
    """
    if isinstance(subtypes, str):
        # A bare string would otherwise be tested with ``in`` as a
        # substring match instead of as one subtype name.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # Materialize once: a one-shot iterable (e.g. a generator) would be
        # exhausted after the first membership test.
        subtypes = tuple(subtypes)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Remove annotations of the given subtypes from a single page.

    Matching entries are dropped from the page's annotation array. When a
    dropped entry is an indirect reference, the object it refers to is also
    replaced by a ``NullObject`` to reduce PDF size.

    Args:
        page: Page (or reference to it) whose annotations are filtered.
        subtypes: Annotation subtype names to remove, e.g. ``("/Link",)``.
            ``None`` removes every annotation.
    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    if isinstance(subtypes, str):
        # A bare string would otherwise be matched by substring.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # A one-shot iterable would be exhausted by the first test.
        subtypes = tuple(subtypes)

    annots = cast(ArrayObject, page[PG.ANNOTS])
    kept = []
    dropped = []
    for an in annots:
        if subtypes is None:
            remove = True
        else:
            obj = cast(DictionaryObject, an.get_object())
            try:
                remove = cast(str, obj["/Subtype"]) in subtypes
            except (KeyError, TypeError):
                # No usable /Subtype: it cannot match a requested subtype,
                # so keep the entry rather than failing.
                remove = False
        (dropped if remove else kept).append(an)

    # Apply the changes only after the whole array has been classified, so
    # an error while reading one entry leaves the page untouched.
    for an in dropped:
        if isinstance(an, IndirectObject):
            self._objects[an.idnum - 1] = NullObject()  # to reduce PDF size
    annots[:] = kept

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
) -> None:
    """
    Remove objects specified by `to_delete` from the given page.

    Every flag that is set is honoured, so combined values such as
    ``ObjectDeletionFlag.LINKS | ObjectDeletionFlag.IMAGES`` remove both
    kinds of objects in a single call.

    Args:
        page: Page object to clean up
        to_delete: Objects to be deleted; can be a `ObjectDeletionFlag`
            (possibly several members combined with ``|``) or an iterable
            of `ObjectDeletionFlag`

    Raises:
        TypeError: `to_delete` is neither a flag nor an iterable.
        ValueError: an element of `to_delete` is not a valid flag value.
    """
    if not isinstance(to_delete, ObjectDeletionFlag):
        # Fold any iterable of flags into one combined flag.
        combined = ObjectDeletionFlag(0)
        for flag in to_delete:
            combined |= ObjectDeletionFlag(flag)
        to_delete = combined

    # --- Annotations -----------------------------------------------------
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        self._remove_annots_from_page(page, None)
    else:
        annot_subtypes: List[str] = []
        if to_delete & ObjectDeletionFlag.LINKS:
            annot_subtypes.append("/Link")
        if to_delete & ObjectDeletionFlag.ATTACHMENTS:
            annot_subtypes.extend(
                ("/FileAttachment", "/Sound", "/Movie", "/Screen")
            )
        if to_delete & ObjectDeletionFlag.OBJECTS_3D:
            annot_subtypes.append("/3D")
        if annot_subtypes:
            self._remove_annots_from_page(page, tuple(annot_subtypes))

    # --- Content streams ---------------------------------------------------
    drop_images = bool(to_delete & ObjectDeletionFlag.IMAGES)
    drop_text = bool(to_delete & ObjectDeletionFlag.TEXT)
    if not (drop_images or drop_text):
        # Only annotation flags (or no flag at all): content is untouched.
        return

    # Operators that are removed outright from every cleaned stream.
    jump_operators = set()
    if drop_images:
        jump_operators.update(
            [b"w", b"J", b"j", b"M", b"d", b"i"]
            + [b"W", b"W*"]
            + [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
            + [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
            + [b"sh"]
        )
    if drop_text:
        jump_operators.update([b"Tj", b"TJ", b"'", b'"'])

    images = set()  # names of image XObjects in the page resources
    forms = set()  # names of form XObjects in the page resources

    def clean(content: ContentStream) -> None:
        """Drop the unwanted operations from `content` in place."""
        kept = []
        for operation in content.operations:
            operands, operator = operation
            if operator in jump_operators:
                continue
            # NOTE(review): with TEXT set, invocations of form XObjects are
            # dropped entirely (as before this change) - confirm intended.
            if (
                operator == b"Do"
                and operands
                and (
                    drop_images
                    and operands[0] in images
                    or drop_text
                    and operands[0] in forms
                )
            ):
                continue
            kept.append(operation)
        # Filter in one pass: deleting by index and then advancing the
        # index would skip the operation that follows a removed one.
        content.operations[:] = kept

    try:
        d = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
    except KeyError:
        d = {}

    # First classify every XObject, so that cleaning a form does not depend
    # on the order in which the resource dictionary lists its entries.
    for k, v in d.items():
        try:
            subtype = v.get_object()["/Subtype"]
        except (TypeError, KeyError):
            continue  # best effort: ignore entries that cannot be read
        if subtype == "/Image":
            if drop_images:
                images.add(k)
        elif subtype == "/Form":
            forms.add(k)

    # Then replace images with null objects and clean the form streams.
    for k, v in list(d.items()):
        content: Any = None
        if k in images:
            content = NullObject()
        elif k in forms:
            o = v.get_object()
            try:
                if isinstance(o, ContentStream):
                    content = o
                else:
                    content = ContentStream(o, self)
                    content.update(o.items())
                    # These entries are removed from the rebuilt stream's
                    # dictionary.
                    for k1 in ("/Length", "/Filter", "/DecodeParms"):
                        content.pop(k1, None)
                clean(content)
            except (TypeError, KeyError):
                continue  # best effort: leave an unreadable form untouched
        if content is None:
            continue
        if isinstance(v, IndirectObject):
            self._objects[v.idnum - 1] = content
        else:
            d[k] = self._add_object(content)

    if "/Contents" in page:
        content = page["/Contents"].get_object()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)
        clean(cast(ContentStream, content))
        # The cleaned stream is added as a new object below, so the objects
        # that held the previous contents are replaced by null objects.
        if isinstance(page["/Contents"], ArrayObject):
            for o in cast(ArrayObject, page["/Contents"]):
                if isinstance(o, IndirectObject):
                    self._objects[o.idnum - 1] = NullObject()
        try:
            self._objects[
                cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
            ] = NullObject()
        except AttributeError:
            pass  # contents had no indirect reference to release
        page[NameObject("/Contents")] = self._add_object(content)

def remove_images(self, ignore_byte_string_object: bool = False) -> None:
    """
    Remove images from this output.

    Every page of the writer is passed to ``remove_objects_from_page`` with
    ``ObjectDeletionFlag.IMAGES``.

    Args:
        ignore_byte_string_object: obsolete; still accepted so existing
            callers keep working, but its value is not used.
    """
    # Processing page by page means a page without /Contents no longer
    # stops the remaining pages from being processed.
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES)

def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
"""
Expand All @@ -1906,44 +1978,10 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
Remove text from this output.

Args:
ignore_byte_string_object: optional parameter
ignore_byte_string_object: obsolete
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
"""
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
pages = cast(List[IndirectObject], pg_dict[PA.KIDS])
for page in pages:
page_ref = cast(PageObject, self.get_object(page))
content = page_ref["/Contents"].get_object()
if not isinstance(content, ContentStream):
content = ContentStream(content, page_ref)
for operands, operator in content.operations:
if operator in [b"Tj", b"'"]:
text = operands[0]
if not ignore_byte_string_object:
if isinstance(text, TextStringObject):
operands[0] = TextStringObject()
else:
if isinstance(text, (TextStringObject, ByteStringObject)):
operands[0] = TextStringObject()
elif operator == b'"':
text = operands[2]
if not ignore_byte_string_object:
if isinstance(text, TextStringObject):
operands[2] = TextStringObject()
else:
if isinstance(text, (TextStringObject, ByteStringObject)):
operands[2] = TextStringObject()
elif operator == b"TJ":
for i in range(len(operands[0])):
if not ignore_byte_string_object:
if isinstance(operands[0][i], TextStringObject):
operands[0][i] = TextStringObject()
else:
if isinstance(
operands[0][i], (TextStringObject, ByteStringObject)
):
operands[0][i] = TextStringObject()

page_ref.__setitem__(NameObject("/Contents"), content)
for page in self.pages:
self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)

def removeText(self, ignoreByteStringObject: bool = False) -> None: # deprecated
"""
Expand Down
Loading