Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add "layout" mode for text extraction #2388

Merged
merged 41 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
86ed974
ENH: text extraction "layout" mode
shartzog Jan 3, 2024
f43b84e
BUG: bad refactor in _layout_mode/_fonts.py
shartzog Jan 3, 2024
220de15
STY: Address ruff issues
shartzog Jan 3, 2024
21d9f1b
STY: Address additional ruff issues
shartzog Jan 3, 2024
9fa3b5f
STY: final ruff fixes?
shartzog Jan 3, 2024
1545a27
STY: final final ruff issue?
shartzog Jan 3, 2024
81b6a83
STY: installed ruff and used --fix
shartzog Jan 3, 2024
bb9190b
STY: Address mypy errors
shartzog Jan 3, 2024
cefbfc6
Merge branch 'py-pdf:main' into text-layout-mode
shartzog Jan 4, 2024
ff7e40f
MAINT: Address PR review comments
shartzog Jan 4, 2024
f37909c
DOC: remove unnecessary "Methods" area
shartzog Jan 4, 2024
48e971e
TST: fp.read() encoding fix
shartzog Jan 4, 2024
8742bcc
MAINT: Address review comments
shartzog Jan 5, 2024
8e9d879
STY: ruff import order error
shartzog Jan 5, 2024
4dc3250
TST: space_vertically
shartzog Jan 5, 2024
d1d85a0
TST: missed rstrip()
shartzog Jan 6, 2024
70b2f31
Improve line coverage
MartinThoma Jan 6, 2024
e7d5edd
test to_dict
MartinThoma Jan 6, 2024
64d1df0
Add test with form
MartinThoma Jan 6, 2024
377bbd1
ENH: TJ spacing and rotation handling
shartzog Jan 7, 2024
3a0fc89
DOC: cleanup docstrings and Font dataclass
shartzog Jan 8, 2024
fe7bb69
DOC: missed dataclass attribute
shartzog Jan 8, 2024
cec0be3
Merge branch 'text-layout-mode' into main
shartzog Jan 8, 2024
955bd38
Merge pull request #1 from shartzog/main
shartzog Jan 8, 2024
579692a
BUG: address bugs caused by rename/refactor
shartzog Jan 8, 2024
4402caa
Merge branch 'text-layout-mode' of https://github.com/shartzog/pypdf …
shartzog Jan 8, 2024
41417eb
Fix ruff/mypy
shartzog Jan 8, 2024
778f3c7
Fix minor alignment change in multicolumn test
shartzog Jan 8, 2024
8279c79
Epic Page test for Font coverage
shartzog Jan 8, 2024
75aec12
Tests Bug Fixes "Uncommon" Operators
shartzog Jan 8, 2024
f25e9d5
Typing / Style Corrections
shartzog Jan 8, 2024
744a6db
Font refactoring/tests
shartzog Jan 9, 2024
c5f0cd8
utf-8 instead of utf8
MartinThoma Jan 9, 2024
1b65085
oops
MartinThoma Jan 9, 2024
373025d
Fix sphinx build warning
MartinThoma Jan 9, 2024
cdaa9ca
Run pre-commit
MartinThoma Jan 9, 2024
64e4c83
Run pre-commit
MartinThoma Jan 9, 2024
878e407
Improvements using pathlib
MartinThoma Jan 9, 2024
e9962b3
Use splitlines
MartinThoma Jan 9, 2024
06e79d3
Use Optional
MartinThoma Jan 9, 2024
f43201a
Move STANDARD_WIDTHS to own file
MartinThoma Jan 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion docs/user/extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ page = reader.pages[0]
print(page.extract_text())
```

you can also choose to limit the text orientation you want to extract, e.g:
You can also choose to limit the text orientation you want to extract, e.g:

```python
# extract only text oriented up
Expand All @@ -20,6 +20,24 @@ print(page.extract_text(0))
print(page.extract_text((0, 90)))
```

You can also extract text in "layout" mode:

```python
# extract text in a fixed width format that closely adheres to the rendered
# layout in the source pdf
print(page.extract_text(extraction_mode="layout"))

# extract text preserving horizontal positioning without excess vertical
# whitespace (removes blank and "whitespace only" lines)
print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))

# adjust horizontal spacing
print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))

# exclude (default) or include (as shown below) text rotated w.r.t. the page
print(page.extract_text(extraction_mode="layout", layout_mode_strip_rotated=False))
```

Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.

## Using a visitor
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def build_char_map_from_dict(

# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
"/Courrier": 600,
"/Courier": 600,
"/Courier-Bold": 600,
"/Courier-BoldOblique": 600,
"/Courier-Oblique": 600,
Expand Down
123 changes: 123 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@

import math
import re
import sys
from decimal import Decimal
from pathlib import Path
from typing import (
Any,
Callable,
Expand All @@ -50,6 +52,7 @@
from ._protocols import PdfReaderProtocol, PdfWriterProtocol
from ._text_extraction import (
OrientationNotFoundError,
_layout_mode,
crlf_space_check,
handle_tj,
mult,
Expand Down Expand Up @@ -83,6 +86,12 @@
StreamObject,
)

if sys.version_info >= (3, 8):
from typing import Literal
else:
from typing_extensions import Literal


MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'


Expand Down Expand Up @@ -1868,6 +1877,87 @@
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
"""
Get fonts formatted for "layout" mode text extraction.

Returns:
Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name
"""
# Font retrieval logic adapted from pypdf.PageObject._extract_text()
objr: Any = self
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
while NameObject(PG.RESOURCES) not in objr:
objr = objr["/Parent"].get_object()

Check warning on line 1890 in pypdf/_page.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_page.py#L1890

Added line #L1890 was not covered by tests
resources_dict: Any = objr[PG.RESOURCES]
fonts: Dict[str, _layout_mode.Font] = {}
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
if "/Font" in resources_dict and self.pdf is not None:
for font_name in resources_dict["/Font"]:
*cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
font_dict = {
k: self.pdf.get_object(v)
if isinstance(v, IndirectObject)
else [
self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v
for _v in v
]
if isinstance(v, ArrayObject)
else v
for k, v in font_dict_obj.items()
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For me, fonts only ever return a DictionaryObject (most of the time through an IndirectObject); in that case get_object() will always return the actual object. This also seems consistent with further use, where I cannot find any ArrayObjects.
In that case I would propose:

Suggested change
font_dict = {
k: self.pdf.get_object(v)
if isinstance(v, IndirectObject)
else [
self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v
for _v in v
]
if isinstance(v, ArrayObject)
else v
for k, v in font_dict_obj.items()
}
font_dict = {
k: v.get_object() for k, v in font_dict_obj.items()
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, the nasty comprehension seems to be necessary. Here's an example of a PDF that requires the alternate ArrayObject interpretation.
Epic Page.PDF

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shartzog
I'm sorry, I do not see the issue:

import pypdf
r = pypdf.PdfReader("Epic Page.pdf")
for k,v in r.pages[0]["/Resources"]["/Font"].items():
    print(k,type(v),type(v.get_object()))

returns:
/F0 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F1 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F2 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F3 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F4 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F5 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F6 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F7 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F8 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>
/F9 <class 'pypdf.generic._base.IndirectObject'> <class 'pypdf.generic._data_structures.DictionaryObject'>

I cannot see the ArrayObject 😐. Can you help me find the hiccup?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Huh... I added a temporary print wrapper like so and got output from it...

def _print_wrapper(obj):  # just stuck this at the top of _page.py for a moment
    print(f"{obj}")
    return obj
    .
    .
    .
                font_dict = {
                    k: self.pdf.get_object(v)
                    if isinstance(v, IndirectObject)
                    else [
                        _print_wrapper(self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v)
                        for _v in v
                    ]
                    if isinstance(v, ArrayObject)
                    else v
                    for k, v in font_dict_obj.items()
                }
    .
    .
    .

Maybe I need to retest with the latest main??

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll let you know how further testing turns out. 👍

# mypy really sucks at unpacking
fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type]
return fonts

def _layout_mode_text(
self,
space_vertically: bool = True,
scale_weight: float = 1.25,
strip_rotated: bool = True,
debug_path: Union[Path, None] = None,
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
) -> str:
"""
Get text preserving fidelity to source PDF text layout.

Args:
space_vertically: include blank lines inferred from y distance + font
height. Defaults to True.
scale_weight: multiplier for string length when calculating weighted
average character width. Defaults to 1.25.
strip_rotated: Removes text that is rotated w.r.t. the page from
layout mode output. Defaults to True.
debug_path (Path | None): if supplied, must target a directory.
Creates the following files with debug information for layout mode
functions:
- fonts.json: output of self._layout_mode_fonts
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Defaults to None.

Returns:
str: multiline string containing page text in a fixed width format that
closely adheres to the rendered layout in the source pdf.
"""
fonts = self._layout_mode_fonts()
if debug_path: # pragma: no cover
import json
debug_path.joinpath("fonts.json").write_text(
json.dumps(fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)),
"utf-8",
)

ops = iter(ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations)
bt_groups = _layout_mode.text_show_operations(ops, fonts, strip_rotated, debug_path)

if not bt_groups:
return ""
shartzog marked this conversation as resolved.
Show resolved Hide resolved

ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)

def extract_text(
self,
*args: Any,
Expand All @@ -1876,6 +1966,8 @@
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
extraction_mode: Literal["plain", "layout"] = "plain",
**kwargs: Any,
) -> str:
"""
Locate all text drawing commands, in the order they are provided in the
Expand Down Expand Up @@ -1913,10 +2005,41 @@
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
"layout" for experimental layout mode functionality.
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
in "layout" mode.

KwArgs:
layout_mode_space_vertically (bool): include blank lines inferred from
y distance + font height. Defaults to True.
layout_mode_scale_weight (float): multiplier for string length when calculating
weighted average character width. Defaults to 1.25.
layout_mode_strip_rotated (bool): layout mode does not support rotated text.
Removes text that is rotated w.r.t. the page from layout mode output.
Set to False to include rotated text anyway. If rotated text is discovered,
layout will be degraded and a warning will result. Defaults to True.
layout_mode_debug_path (Path | None): if supplied, must target a directory.
Creates the following files with debug information for layout mode
functions:
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
- fonts.json: output of self._layout_mode_fonts
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)

Returns:
The extracted text
"""
if extraction_mode not in ["plain", "layout"]:
raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
if extraction_mode == "layout":
return self._layout_mode_text(
space_vertically=kwargs.get("layout_mode_space_vertically", True),
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
debug_path=kwargs.get("layout_mode_debug_path", None)
)
if len(args) >= 1:
if isinstance(args[0], str):
if len(args) >= 3:
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []

LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5

class OrientationNotFoundError(Exception):
pass
Expand Down
5 changes: 5 additions & 0 deletions pypdf/_text_extraction/_layout_mode/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Layout mode text extraction extension for pypdf"""
from ._fixed_width_page import (
    fixed_char_width,
    fixed_width_page,
    text_show_operations,
    y_coordinate_groups,
)
from ._font import Font

# Names re-exported from the private submodules; order kept as originally declared.
__all__ = [
    "fixed_char_width",
    "fixed_width_page",
    "text_show_operations",
    "y_coordinate_groups",
    "Font",
]
Loading
Loading