From 5400f5ada933b7b4f54ba9eb7c9db53729ef551c Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:07:59 +0900 Subject: [PATCH 1/9] BUG: Missing spaces in extract_text() method (#1328) --- pypdf/_page.py | 2 +- tests/test_text_extraction.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4ec053c8..8e9dbc21e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op)) >= _space_width) + (math.ceil(abs(float(op))) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 2f0eaad1d..faef6d980 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) + + +@pytest.mark.enable_socket() +def test_space_with_one_unit_smaller_than_font_width(): + """Tests for #1328""" + url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + name = "iss1328.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() + assert """Reporting crude oil leak. +Leak was isolated to well +pad. Segment of line was +immediately isolated, now +estimated at 5 barrels of oil +spilt. Root cause still +unknown at this time.""" == extracted From aac04364611818571fc24a53f36e325849f0371a Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:42:47 +0900 Subject: [PATCH 2/9] Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. --- pypdf/_page.py | 2 +- tests/test_text_extraction.py | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 8e9dbc21e..e4ec053c8 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (math.ceil(abs(float(op))) >= _space_width) + (abs(float(op)) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index faef6d980..2f0eaad1d 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,20 +189,3 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) - - -@pytest.mark.enable_socket() -def test_space_with_one_unit_smaller_than_font_width(): - """Tests for #1328""" - url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" - name = "iss1328.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - page = reader.pages[0] - extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert """Reporting crude oil leak. -Leak was isolated to well -pad. Segment of line was -immediately isolated, now -estimated at 5 barrels of oil -spilt. Root cause still -unknown at this time.""" == extracted From 64b1c92abec2d72d90086c0a074c3712ff86249d Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:44:42 +0900 Subject: [PATCH 3/9] BUG: Missing spaces in extract_text() method (#1328) --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4ec053c8..8e9dbc21e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op)) >= _space_width) + (math.ceil(abs(float(op))) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): From 70e9b386a409b7dbe559fc1db4759eb866746d82 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:48:01 +0900 Subject: [PATCH 4/9] BUG: Missing spaces in extract_text() method (#1328) add test --- tests/test_text_extraction.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 2f0eaad1d..93082349a 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) + + +@pytest.mark.enable_socket() +def test_space_with_one_unit_smaller_than_font_width(): + """Tests for #1328""" + url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + name = "iss1328.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() + assert extracted == """Reporting crude oil leak. +Leak was isolated to well +pad. Segment of line was +immediately isolated, now +estimated at 5 barrels of oil +spilt. Root cause still +unknown at this time.""" From 65224e1f2dc85da56beedddd25179f22ccacf6be Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:42:47 +0900 Subject: [PATCH 5/9] Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. BUG: Missing spaces in extract_text() method (#1328) BUG: Missing spaces in extract_text() method (#1328) add test --- tests/test_text_extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index faef6d980..93082349a 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -199,10 +199,10 @@ def test_space_with_one_unit_smaller_than_font_width(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert """Reporting crude oil leak. + assert extracted == """Reporting crude oil leak. Leak was isolated to well pad. Segment of line was immediately isolated, now estimated at 5 barrels of oil spilt. Root cause still -unknown at this time.""" == extracted +unknown at this time.""" From f6dcb439a3e667802558440d79e6b388307a6fed Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 14:26:45 +0900 Subject: [PATCH 6/9] BUG: Missing spaces in extract_text() method (#1328) Convert font size comparison to ratio --- pypdf/_page.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 8e9dbc21e..ff4ef30c4 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1988,8 +1988,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) + # The space width may be smaller than the font width, so the width should be 95%. if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (math.ceil(abs(float(op))) >= _space_width) + (abs(float(op) / 0.95) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): From fd1c48930683bafe4f656b1fe742827a29d4feae Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:39:31 +0900 Subject: [PATCH 7/9] Correction to new file URL. Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_text_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 93082349a..08c4fc4f1 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -194,7 +194,7 @@ def test_layout_mode_warnings(mock_logger_warning): @pytest.mark.enable_socket() def test_space_with_one_unit_smaller_than_font_width(): """Tests for #1328""" - url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf" name = "iss1328.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] From 2873b9eb9130a6529cc4e7003e37c3655bc3a7b6 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 18:55:45 +0900 Subject: [PATCH 8/9] BUG: Missing spaces in extract_text() method (py-pdf#1328) calculation efficiency --- pypdf/_page.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index ff4ef30c4..87b914ce2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1985,12 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) elif operator == b"TJ": + # The space width may be smaller than the font width, so the width should be 95%. + _confirm_space_width = _space_width * 0.95 for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) - # The space width may be smaller than the font width, so the width should be 95%. if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op) / 0.95) >= _space_width) + (abs(float(op)) >= _confirm_space_width) and (len(text) > 0) and (text[-1] != " ") ): From 7597704eaa466934690853b370c850e2f6253e09 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Tue, 24 Sep 2024 21:14:02 +0900 Subject: [PATCH 9/9] BUG: Missing spaces in extract_text() method (py-pdf#1328) Simplify the assertion process --- tests/test_text_extraction.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 08c4fc4f1..8bfa1809e 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -198,11 +198,5 @@ def test_space_with_one_unit_smaller_than_font_width(): name = "iss1328.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] - extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert extracted == """Reporting crude oil leak. -Leak was isolated to well -pad. Segment of line was -immediately isolated, now -estimated at 5 barrels of oil -spilt. Root cause still -unknown at this time.""" + extracted = page.extract_text() + assert "Reporting crude oil leak.\n" in extracted