Skip to content

Commit

Permalink
ENH: Add possibility to get names of fonts
Browse files Browse the repository at this point in the history
See #153

Co-authored-by: tiarno <[email protected]>
  • Loading branch information
MartinThoma and tiarno committed Jul 9, 2022
1 parent b42e0db commit d3bf26a
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 0 deletions.
56 changes: 56 additions & 0 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
Iterator,
List,
Optional,
Set,
Tuple,
Union,
cast,
Expand Down Expand Up @@ -1338,6 +1339,35 @@ def extractText(
deprecate_with_replacement("extractText", "extract_text")
return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The fonts are collected by walking the page's ``/Resources`` dictionary.
    ``/Resources`` is an inheritable page attribute, so when the page itself
    has no such entry the ``/Parent`` chain is followed until one is found.
    A page without any reachable ``/Resources`` yields two empty sets
    instead of raising ``KeyError``.

    :return: (Set of embedded fonts, set of unembedded fonts)
    """
    obj = self.get_object()
    assert obj is not None

    # _get_fonts_walk adds to the sets it is given, so both accumulators are
    # shared by every call below and its return value is not needed.
    fonts: Set[str] = set()
    embedded: Set[str] = set()
    if isinstance(obj, ArrayObject):
        for item in obj:
            if hasattr(item, "keys"):
                _get_fonts_walk(item, fonts, embedded)
    elif isinstance(obj, DictionaryObject):
        resources = None
        node = obj
        # Nodes already inspected; guards against a malformed, cyclic
        # /Parent chain looping forever.
        inspected = []
        while isinstance(node, DictionaryObject) and not any(
            node is previous for previous in inspected
        ):
            if "/Resources" in node:
                resources = node["/Resources"]
                break
            inspected.append(node)
            node = node["/Parent"] if "/Parent" in node else None
        if resources is not None:
            _get_fonts_walk(resources, fonts, embedded)
    unembedded = fonts - embedded
    return embedded, unembedded

mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
Expand Down Expand Up @@ -1486,3 +1516,29 @@ def __getitem__(self, index: int) -> PageObject:
def __iter__(self) -> Iterator[PageObject]:
    """Yield ``self[0]`` through ``self[len(self) - 1]``, in index order."""
    # The length is read once, when iteration starts; each item is then
    # fetched lazily through __getitem__.
    yield from (self[index] for index in range(len(self)))


def _get_fonts_walk(
    obj: "DictionaryObject", fnt: Set[str], emb: Set[str]
) -> Tuple[Optional[Set[str]], Optional[Set[str]]]:
    """
    Collect the names of the used and the embedded fonts reachable from ``obj``.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.
    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    Both nested dictionaries and nested arrays are searched, so font
    dictionaries that are only reachable through an array (such as the
    entries of '/DescendantFonts') are found as well. Every container is
    visited at most once, which makes reference cycles harmless, and the
    traversal is iterative, so deep nesting cannot exhaust the recursion limit.

    :param obj: dictionary-like object (anything with a ``keys`` method) to
        start the search from
    :param fnt: set receiving the names of the fonts used; updated in place
    :param emb: set receiving the names of the embedded fonts; updated in place
    :return: ``(fnt, emb)`` - the very sets that were passed in - or
        ``(None, None)`` when ``obj`` is not dictionary-like
    """
    if not hasattr(obj, "keys"):
        return None, None
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    # Maps id(container) -> container. Holding the reference keeps the
    # container alive, so its id cannot be reused while the walk is running.
    visited = {}
    pending = [obj]
    while pending:
        node = pending.pop()
        if not hasattr(node, "keys"):
            # Array elements may be references that plain iteration does not
            # resolve; follow them when the element offers get_object().
            resolve = getattr(node, "get_object", None)
            if callable(resolve):
                node = resolve()
        is_mapping = hasattr(node, "keys")
        if not is_mapping and not isinstance(node, (list, tuple)):
            continue  # scalar (name, number, string, ...): nothing to search
        if id(node) in visited:
            continue  # already handled: shared object or reference cycle
        visited[id(node)] = node

        if is_mapping:
            if "/BaseFont" in node:
                fnt.add(node["/BaseFont"])
            # A font descriptor that carries a font file marks an embedded font.
            if "/FontName" in node and any(key in node for key in fontkeys):
                emb.add(node["/FontName"])
            # Subscript access (rather than .values()) mirrors how the values
            # were read before, leaving any resolution to the container.
            pending.extend(node[key] for key in node.keys())
        else:
            pending.extend(node)

    return fnt, emb  # return the sets for each page
55 changes: 55 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,58 @@ def test_extract_text_operator_t_star(): # L1266, L1267
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


@pytest.mark.parametrize(
    ("pdf_path", "password", "embedded", "unembedded"),
    [
        (
            os.path.join(RESOURCE_ROOT, "crazyones.pdf"),
            None,
            {
                "/HHXGQB+SFTI1440",
                "/TITXYI+SFRM0900",
                "/YISQAD+SFTI1200",
            },
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "attachment.pdf"),
            None,
            {
                "/HHXGQB+SFTI1440",
                "/TITXYI+SFRM0900",
                "/YISQAD+SFTI1200",
            },
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"),
            "openpassword",
            {"/BAAAAA+DejaVuSans"},
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
        (os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"), None, set(), set()),
        (
            os.path.join(RESOURCE_ROOT, "reportlab-inline-image.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
    ],
)
def test_get_fonts(pdf_path, password, embedded, unembedded):
    """
    Check the font names reported for a whole document.

    The per-page results of ``_get_fonts`` are merged over all pages and
    compared with the expected embedded / unembedded sets of the case.
    """
    reader = PdfReader(pdf_path, password=password)
    found_embedded = set()
    found_unembedded = set()
    for page in reader.pages:
        page_embedded, page_unembedded = page._get_fonts()
        found_embedded |= page_embedded
        found_unembedded |= page_unembedded
    assert (found_embedded, found_unembedded) == (embedded, unembedded)

0 comments on commit d3bf26a

Please sign in to comment.