Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add reader.attachments public interface #1661

Merged
merged 6 commits into from
Feb 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ You can contribute to `pypdf on GitHub <https://github.com/py-pdf/pypdf>`_.
user/metadata
user/extract-text
user/extract-images
user/extract-attachments
user/encryption-decryption
user/merging-pdfs
user/cropping-and-transforming
Expand Down
18 changes: 18 additions & 0 deletions docs/user/extract-attachments.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Extract Attachments

PDF documents can contain attachments. Attachments have a name, but it might not
be unique. For this reason, the value of `reader.attachments["attachment_name"]`
is a list.

You can extract all attachments like this:

```python
from pypdf import PdfReader

reader = PdfReader("example.pdf")

# Use .items(): iterating the mapping directly yields only the attachment
# names, not (name, content_list) pairs.
for name, content_list in reader.attachments.items():
    # A name may occur several times, so each name maps to a list of contents;
    # the index keeps the output file names distinct.
    for i, content in enumerate(content_list):
        with open(f"{name}-{i}", "wb") as fp:
            fp.write(content)
```
32 changes: 32 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
Callable,
Dict,
Iterable,
Iterator,
List,
Mapping,
Optional,
Tuple,
Union,
Expand Down Expand Up @@ -2146,6 +2148,15 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
interim[NameObject("/T")] = TextStringObject(name)
return interim

@property
def attachments(self) -> Mapping[str, List[bytes]]:
    """
    Map each attachment name to the list of contents stored under that name.

    The returned mapping is lazy: every name reported by
    ``_list_attachments`` is paired with a deferred call to
    ``_get_attachment_list``, which only runs when that name is looked up.
    """
    deferred_loaders = {}
    for attachment_name in self._list_attachments():
        # Duplicate names collapse into a single key here.
        deferred_loaders[attachment_name] = (
            self._get_attachment_list,
            attachment_name,
        )
    return LazyDict(deferred_loaders)

def _list_attachments(self) -> List[str]:
"""
Retrieves the list of filenames of file attachments.
Expand All @@ -2172,6 +2183,12 @@ def _list_attachments(self) -> List[str]:
attachments_names.append(f)
return attachments_names

def _get_attachment_list(self, name: str) -> List[bytes]:
    """
    Return the contents of all attachments called ``name`` as a list.

    ``_get_attachments`` yields either a single ``bytes`` value or a list of
    them for a given name; a single value is wrapped so that callers always
    receive a list.
    """
    content = self._get_attachments(name)[name]
    return content if isinstance(content, list) else [content]

def _get_attachments(
self, filename: Optional[str] = None
) -> Dict[str, Union[bytes, List[bytes]]]:
Expand Down Expand Up @@ -2220,6 +2237,21 @@ def _get_attachments(
return attachments


class LazyDict(Mapping):
    """
    Read-only mapping whose values are computed when they are accessed.

    The wrapped dictionary stores ``(func, arg)`` pairs. Looking up a key
    calls ``func(arg)`` and returns the result. Results are not cached, so
    every lookup calls ``func`` again.
    """

    def __init__(self, *args: Any, **kw: Any) -> None:
        """Accept the same arguments as ``dict``; values must be ``(func, arg)`` pairs."""
        self._raw_dict = dict(*args, **kw)

    def __getitem__(self, key: str) -> Any:
        """Return ``func(arg)`` for the pair stored under ``key`` (``KeyError`` if absent)."""
        func, arg = self._raw_dict[key]
        return func(arg)

    def __iter__(self) -> Iterator[Any]:
        """Iterate over the keys without evaluating any value."""
        return iter(self._raw_dict)

    def __len__(self) -> int:
        """Return the number of keys."""
        return len(self._raw_dict)

    def __repr__(self) -> str:
        """Return a debugging representation that lists only the keys."""
        # Rendering the values would trigger every deferred computation.
        return f"{type(self).__name__}(keys={list(self._raw_dict)})"


class PdfFileReader(PdfReader): # deprecated
def __init__(self, *args: Any, **kwargs: Any) -> None:
deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0")
Expand Down
27 changes: 22 additions & 5 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,25 +1203,42 @@ def test_attachments():
b.seek(0)
reader = PdfReader(b)
b = None
assert reader.attachments == {}
assert reader._list_attachments() == []
assert reader._get_attachments() == {}
writer.add_attachment("foobar.txt", b"foobarcontent")
writer.add_attachment("foobar2.txt", b"foobarcontent2")
writer.add_attachment("foobar2.txt", b"2nd_foobarcontent")
to_add = [
("foobar.txt", b"foobarcontent"),
("foobar2.txt", b"foobarcontent2"),
("foobar2.txt", b"2nd_foobarcontent"),
]
for name, content in to_add:
writer.add_attachment(name, content)

b = BytesIO()
writer.write(b)
b.seek(0)
reader = PdfReader(b)
b = None
assert reader._list_attachments() == ["foobar.txt", "foobar2.txt", "foobar2.txt"]
assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add})
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
assert reader._list_attachments() == [name for name, _ in to_add]

# We've added the same key twice - hence only 2 and not 3:
att = reader._get_attachments()
assert len(att) == 2
assert len(att) == 2 # we have 2 keys, but 3 attachments!

# The content for foobar.txt is clear and just a single value:
assert att["foobar.txt"] == b"foobarcontent"

# The content for foobar2.txt is a list!
att = reader._get_attachments("foobar2.txt")
assert len(att) == 1
assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"]

# Let's do both cases with the public interface:
assert reader.attachments["foobar.txt"][0] == b"foobarcontent"
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
assert reader.attachments["foobar2.txt"][0] == b"foobarcontent2"
assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent"


@pytest.mark.external
def test_iss1614():
Expand Down