From 635a7c16c829acab942c7ded0ea43fdfb451ad4c Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Wed, 25 Sep 2024 02:42:50 +0900 Subject: [PATCH] BUG: Missing spaces in extract_text() method (#1328) (#2868) * BUG: Missing spaces in extract_text() method (#1328) * Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. * BUG: Missing spaces in extract_text() method (#1328) * BUG: Missing spaces in extract_text() method (#1328) add test * Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. BUG: Missing spaces in extract_text() method (#1328) BUG: Missing spaces in extract_text() method (#1328) add test * BUG: Missing spaces in extract_text() method (#1328) Convert font size comparison to ratio * Correction to new file URL. Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> * BUG: Missing spaces in extract_text() method (py-pdf#1328) calculation efficiency * BUG: Missing spaces in extract_text() method (py-pdf#1328) Simplify the assertion process --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 4 +++- tests/test_text_extraction.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4ec053c8..87b914ce2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1985,11 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) elif operator == b"TJ": + # A TJ adjustment that stands for a space may be slightly smaller than the font's space width, so use 95% of that width as the threshold.
+ _confirm_space_width = _space_width * 0.95 for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op)) >= _space_width) + (abs(float(op)) >= _confirm_space_width) and (len(text) > 0) and (text[-1] != " ") ): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 2f0eaad1d..8bfa1809e 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,3 +189,14 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) + + +@pytest.mark.enable_socket() +def test_space_with_one_unit_smaller_than_font_width(): + """Tests for #1328""" + url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf" + name = "iss1328.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + extracted = page.extract_text() + assert "Reporting crude oil leak.\n" in extracted