Skip to content

Commit

Permalink
ENH: Add reader.attachments public interface (#1661)
Browse files Browse the repository at this point in the history
Add `PdfReader.attachments -> Mapping[str, List[bytes]]` as a public interface.

The heavy lifting was done by @pubpub-zz in #1611. This PR only adds the interface for the existing functions.
  • Loading branch information
MartinThoma authored Feb 26, 2023
1 parent 67b085b commit d343445
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ You can contribute to `pypdf on GitHub <https://github.com/py-pdf/pypdf>`_.
user/metadata
user/extract-text
user/extract-images
user/extract-attachments
user/encryption-decryption
user/merging-pdfs
user/cropping-and-transforming
Expand Down
18 changes: 18 additions & 0 deletions docs/user/extract-attachments.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Extract Attachments

PDF documents can contain attachments. Attachments have a name, but it might not
be unique. For this reason, the value of `reader.attachments["attachment_name"]`
is a list.

You can extract all attachments like this:

```python
from pypdf import PdfReader

reader = PdfReader("example.pdf")

# `reader.attachments` is a mapping; iterating it directly yields only the
# names, so `.items()` is required to get (name, list-of-contents) pairs.
for name, content_list in reader.attachments.items():
    # Several attachments may share one name, hence the index suffix.
    for i, content in enumerate(content_list):
        with open(f"{name}-{i}", "wb") as fp:
            fp.write(content)
```
32 changes: 32 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
Callable,
Dict,
Iterable,
Iterator,
List,
Mapping,
Optional,
Tuple,
Union,
Expand Down Expand Up @@ -2146,6 +2148,15 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
interim[NameObject("/T")] = TextStringObject(name)
return interim

@property
def attachments(self) -> Mapping[str, List[bytes]]:
    """
    Mapping of attachment name to the list of contents stored under it.

    The returned ``LazyDict`` stores, for every name reported by
    ``_list_attachments()``, the pair ``(self._get_attachment_list, name)``.
    Names that occur more than once in that list end up as a single key.
    """
    loader = self._get_attachment_list
    deferred = {}
    for filename in self._list_attachments():
        deferred[filename] = (loader, filename)
    return LazyDict(deferred)

def _list_attachments(self) -> List[str]:
"""
Retrieves the list of filenames of file attachments.
Expand All @@ -2172,6 +2183,12 @@ def _list_attachments(self) -> List[str]:
attachments_names.append(f)
return attachments_names

def _get_attachment_list(self, name: str) -> List[bytes]:
out = self._get_attachments(name)[name]
if isinstance(out, list):
return out
return [out]

def _get_attachments(
self, filename: Optional[str] = None
) -> Dict[str, Union[bytes, List[bytes]]]:
Expand Down Expand Up @@ -2220,6 +2237,21 @@ def _get_attachments(
return attachments


class LazyDict(Mapping):
    """
    Read-only mapping whose values are computed on access.

    Every stored entry is a ``(func, arg)`` pair. Looking up a key calls
    ``func(arg)`` and returns the result. Results are not cached, so each
    lookup calls ``func`` again.
    """

    def __init__(self, *args: Any, **kw: Any) -> None:
        # Takes the same arguments as ``dict``; each value must be a
        # ``(func, arg)`` pair, otherwise lookups fail when unpacking.
        self._raw_dict = dict(*args, **kw)

    def __getitem__(self, key: str) -> Any:
        # An unknown key raises KeyError from the underlying dict.
        func, arg = self._raw_dict[key]
        return func(arg)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __repr__(self) -> str:
        # Show only the keys: rendering the values would call every stored
        # function, which defeats the lazy evaluation.
        return f"{type(self).__name__}(keys={list(self._raw_dict)!r})"


class PdfFileReader(PdfReader): # deprecated
def __init__(self, *args: Any, **kwargs: Any) -> None:
deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0")
Expand Down
27 changes: 22 additions & 5 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,25 +1211,42 @@ def test_attachments():
b.seek(0)
reader = PdfReader(b)
b = None
assert reader.attachments == {}
assert reader._list_attachments() == []
assert reader._get_attachments() == {}
writer.add_attachment("foobar.txt", b"foobarcontent")
writer.add_attachment("foobar2.txt", b"foobarcontent2")
writer.add_attachment("foobar2.txt", b"2nd_foobarcontent")
to_add = [
("foobar.txt", b"foobarcontent"),
("foobar2.txt", b"foobarcontent2"),
("foobar2.txt", b"2nd_foobarcontent"),
]
for name, content in to_add:
writer.add_attachment(name, content)

b = BytesIO()
writer.write(b)
b.seek(0)
reader = PdfReader(b)
b = None
assert reader._list_attachments() == ["foobar.txt", "foobar2.txt", "foobar2.txt"]
assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add})
assert reader._list_attachments() == [name for name, _ in to_add]

# We've added the same key twice - hence only 2 and not 3:
att = reader._get_attachments()
assert len(att) == 2
assert len(att) == 2 # we have 2 keys, but 3 attachments!

# The content for foobar.txt is clear and just a single value:
assert att["foobar.txt"] == b"foobarcontent"

# The content for foobar2.txt is a list!
att = reader._get_attachments("foobar2.txt")
assert len(att) == 1
assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"]

# Let's do both cases with the public interface:
assert reader.attachments["foobar.txt"][0] == b"foobarcontent"
assert reader.attachments["foobar2.txt"][0] == b"foobarcontent2"
assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent"


@pytest.mark.external
def test_iss1614():
Expand Down

0 comments on commit d343445

Please sign in to comment.