Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add PdfReader._get_fonts #1083

Merged
merged 5 commits into from
Jul 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
Iterator,
List,
Optional,
Set,
Tuple,
Union,
cast,
Expand Down Expand Up @@ -1338,6 +1339,18 @@ def extractText(
deprecate_with_replacement("extractText", "extract_text")
return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The page's ``/Resources`` dictionary is walked with ``_get_fonts_walk``.
    A page that has no ``/Resources`` entry yields two empty sets instead of
    raising ``KeyError``.

    :return: (Set of embedded fonts, set of unembedded fonts)
    """
    obj = self.get_object()
    assert isinstance(obj, DictionaryObject)
    if "/Resources" not in obj:
        # Nothing to walk; the subscript below would raise KeyError.
        return set(), set()
    fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj["/Resources"]))
    # Names that were collected as used but not as embedded.
    unembedded = fonts - embedded
    return embedded, unembedded

mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
Expand Down Expand Up @@ -1486,3 +1499,35 @@ def __getitem__(self, index: int) -> PageObject:
def __iter__(self) -> Iterator[PageObject]:
    """Yield the items at index 0 up to ``len(self) - 1``, in order."""
    # The length is read once, when iteration starts; each item is fetched
    # through ``self[...]`` only when the consumer asks for it.
    yield from (self[position] for position in range(len(self)))


def _get_fonts_walk(
obj: DictionaryObject,
fnt: Optional[Set[str]] = None,
emb: Optional[Set[str]] = None,
) -> Tuple[Set[str], Set[str]]:
"""
If there is a key called 'BaseFont', that is a font that is used in the document.
If there is a key called 'FontName' and another key in the same dictionary object
that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
embedded.

We create and add to two sets, fnt = fonts used and emb = fonts embedded.
"""
if fnt is None:
fnt = set()
if emb is None:
emb = set()
if not hasattr(obj, "keys"):
return set(), set()
fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
if "/BaseFont" in obj:
fnt.add(cast(str, obj["/BaseFont"]))
if "/FontName" in obj:
if [x for x in fontkeys if x in obj]: # test to see if there is FontFile
emb.add(cast(str, obj["/FontName"]))

for key in obj.keys():
_get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb)

return fnt, emb # return the sets for each page
55 changes: 55 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,58 @@ def test_extract_text_operator_t_star(): # L1266, L1267
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


@pytest.mark.parametrize(
    ("pdf_path", "password", "embedded", "unembedded"),
    [
        (
            os.path.join(RESOURCE_ROOT, "crazyones.pdf"),
            None,
            {
                "/HHXGQB+SFTI1440",
                "/TITXYI+SFRM0900",
                "/YISQAD+SFTI1200",
            },
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "attachment.pdf"),
            None,
            {
                "/HHXGQB+SFTI1440",
                "/TITXYI+SFRM0900",
                "/YISQAD+SFTI1200",
            },
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"),
            "openpassword",
            {"/BAAAAA+DejaVuSans"},
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
        (os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"), None, set(), set()),
        (
            os.path.join(RESOURCE_ROOT, "reportlab-inline-image.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
    ],
)
def test_get_fonts(pdf_path, password, embedded, unembedded):
    """The union of per-page font sets matches the expected names for each PDF."""
    reader = PdfReader(pdf_path, password=password)
    # One (embedded, unembedded) pair per page, in page order.
    per_page = [page._get_fonts() for page in reader.pages]
    found_embedded = set().union(*(emb for emb, _ in per_page))
    found_unembedded = set().union(*(unemb for _, unemb in per_page))
    assert (found_embedded, found_unembedded) == (embedded, unembedded)