From d7b286ec7d74882354298f752f092ab9e45164a0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:06:47 +0200 Subject: [PATCH 01/10] BUG: fix fields update where annots are kids of field closes #2234 closes #2512 replaces #2333 --- pypdf/_writer.py | 137 +++++++++++++++--------------- pypdf/generic/_data_structures.py | 24 ++++++ tests/test_writer.py | 82 ++++++++++++++++++ 3 files changed, 176 insertions(+), 67 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index c7569e31e..17f309664 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -747,13 +747,24 @@ def append_pages_from_reader( if callable(after_page_append): after_page_append(writer_page) - def _update_text_field(self, field: DictionaryObject) -> None: + def _update_field_annotation( + self, field: DictionaryObject, anno: DictionaryObject + ) -> None: # Calculate rectangle dimensions - _rct = cast(RectangleObject, field[AA.Rect]) + _rct = cast(RectangleObject, anno[AA.Rect]) rct = RectangleObject((0, 0, _rct[2] - _rct[0], _rct[3] - _rct[1])) # Extract font information - da = cast(str, field[AA.DA]) + da = anno.get_herited( + AA.DA, + cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get( + AA.DA, None + ), + ) + if da is None: + da = TextStringObject("/Helv 0 Tf 0 g") + else: + da = da.get_object() font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") font_properties = [x for x in font_properties if x != ""] font_name = font_properties[font_properties.index("Tf") - 2] @@ -767,19 +778,27 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Retrieve font information from local DR ... dr: Any = cast( DictionaryObject, - cast(DictionaryObject, field.get("/DR", DictionaryObject())).get_object(), + cast( + DictionaryObject, + anno.get_herited( + "/DR", + cast( + DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] + ).get("/DR", DictionaryObject()), + ), + ).get_object(), ) dr = dr.get("/Font", DictionaryObject()).get_object() if font_name not in dr: # ...or AcroForm dictionary dr = cast( Dict[Any, Any], - cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}), + cast( + DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] + ).get("/DR", {}), ) - if isinstance(dr, IndirectObject): # pragma: no cover - dr = dr.get_object() - dr = dr.get("/Font", DictionaryObject()).get_object() - font_res = dr.get(font_name) + dr = dr.get_object().get("/Font", DictionaryObject()).get_object() + font_res = dr.get(font_name, None) if font_res is not None: font_res = cast(DictionaryObject, font_res.get_object()) font_subtype, _, font_encoding, font_map = build_char_map_from_dict( @@ -806,7 +825,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Retrieve field text and selected values field_flags = field.get(FA.Ff, 0) if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(field.get(FA.Opt, {})) + txt = "\n".join(anno.get_herited(FA.Opt, {})) sel = field.get("/V", []) if not isinstance(sel, list): sel = [sel] @@ -822,7 +841,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: # may be improved but can not find how get fill working => replaced with lined box ap_stream += ( f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" - f"0.5 0.5 0.5 rg s\n{field[AA.DA]}\n" + f"0.5 0.5 0.5 rg s\n{da}\n" ).encode() if line_number == 0: ap_stream += f"2 {y_offset} Td\n".encode() @@ -862,16 +881,16 @@ def _update_text_field(self, field: DictionaryObject) -> None: ) } ) - if AA.AP not in field: - field[NameObject(AA.AP)] = DictionaryObject( + if AA.AP not in anno: + anno[NameObject(AA.AP)] = DictionaryObject( {NameObject("/N"): self._add_object(dct)} ) - elif "/N" not in cast(DictionaryObject, field[AA.AP]): - cast(DictionaryObject, field[NameObject(AA.AP)])[ + elif "/N" not in cast(DictionaryObject, anno[AA.AP]): + cast(DictionaryObject, anno[NameObject(AA.AP)])[ NameObject("/N") ] = self._add_object(dct) else: # [/AP][/N] exists - n = field[AA.AP]["/N"].indirect_reference.idnum # type: ignore + n = anno[AA.AP]["/N"].indirect_reference.idnum # type: ignore self._objects[n - 1] = dct dct.indirect_reference = IndirectObject(n, 0, self) @@ -906,65 +925,49 @@ def update_page_form_field_values( raise PyPdfError("No /Fields dictionary in Pdf in PdfWriter Object") if isinstance(auto_regenerate, bool): self.set_need_appearances_writer(auto_regenerate) - # Iterate through pages, update field values if PG.ANNOTS not in page: logger_warning("No fields to update on this page", __name__) return - # /Helvetica is just in case of but this is normally insufficient as we miss the font resource - default_da = af.get( - InteractiveFormDictEntries.DA, TextStringObject("/Helvetica 0 Tf 0 g") - ) for writer_annot in page[PG.ANNOTS]: # type: ignore writer_annot = cast(DictionaryObject, writer_annot.get_object()) - # retrieve parent field values, if present - writer_parent_annot = writer_annot.get( - PG.PARENT, DictionaryObject() - ).get_object() + if writer_annot.get("/Subtype", "") != "/Widget": + continue + if "/FT" in writer_annot and "/T" in writer_annot: + writer_parent_annot = writer_annot + else: + writer_parent_annot = writer_annot.get( + PG.PARENT, DictionaryObject() + ).get_object() + for field, value in fields.items(): - if ( - writer_annot.get(FA.T) == field - or self._get_qualified_field_name(writer_annot) == field + if not ( + self._get_qualified_field_name(writer_parent_annot) == field + or writer_parent_annot.get("/T", None) == field ): - if isinstance(value, list): - lst = ArrayObject(TextStringObject(v) for v in value) - writer_annot[NameObject(FA.V)] = lst - else: - writer_annot[NameObject(FA.V)] = TextStringObject(value) - if writer_annot.get(FA.FT) in ("/Btn"): - # case of Checkbox button (no /FT found in Radio widgets - writer_annot[NameObject(AA.AS)] = NameObject(value) - elif ( - writer_annot.get(FA.FT) == "/Tx" - or writer_annot.get(FA.FT) == "/Ch" - ): - # textbox - if AA.DA not in writer_annot: - f = writer_annot - da = default_da - while AA.DA not in f: - f = f.get("/Parent") - if f is None: - break - f = f.get_object() - if AA.DA in f: - da = f[AA.DA] - writer_annot[NameObject(AA.DA)] = da - self._update_text_field(writer_annot) - elif writer_annot.get(FA.FT) == "/Sig": - # signature - logger_warning("Signature forms not implemented yet", __name__) - if flags: - writer_annot[NameObject(FA.Ff)] = NumberObject(flags) + continue + if flags: + writer_annot[NameObject(FA.Ff)] = NumberObject(flags) + if isinstance(value, list): + lst = ArrayObject(TextStringObject(v) for v in value) + writer_parent_annot[NameObject(FA.V)] = lst + else: + writer_parent_annot[NameObject(FA.V)] = TextStringObject(value) + if writer_parent_annot.get(FA.FT) in ("/Btn"): + # case of Checkbox button (no /FT found in Radio widgets + v = NameObject(value) + if v not in writer_annot[NameObject(AA.AP)][NameObject("/N")]: + v = NameObject("/Off") + # other cases will be updated through the for loop + writer_annot[NameObject(AA.AS)] = v elif ( - writer_parent_annot.get(FA.T) == field - or self._get_qualified_field_name(writer_parent_annot) == field + writer_parent_annot.get(FA.FT) == "/Tx" + or writer_parent_annot.get(FA.FT) == "/Ch" ): - writer_parent_annot[NameObject(FA.V)] = TextStringObject(value) - for k in writer_parent_annot[NameObject(FA.Kids)]: - k = k.get_object() - k[NameObject(AA.AS)] = NameObject( - value if value in k[AA.AP]["/N"] else "/Off" - ) + # textbox + self._update_field_annotation(writer_parent_annot, writer_annot) + elif writer_annot.get(FA.FT) == "/Sig": + # signature + logger_warning("Signature forms not implemented yet", __name__) def reattach_fields( self, page: Optional[PageObject] = None @@ -2328,7 +2331,7 @@ def merge( Raises: TypeError: The pages attribute is not configured properly """ - if isinstance(fileobj, PdfReader): + if isinstance(fileobj, PdfDocCommon): reader = fileobj else: stream, encryption_obj = self._create_stream(fileobj) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 838336a16..dac249462 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -367,6 +367,30 @@ def _clone( def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) + def get_herited(self, key: str, default: Any = None) -> Any: + """ + Returns the value of a key or from the parent if not found + If not found returns default + + Args: + key: string identifying the field to return + + default: default value to return + + Returns: + current key of herited one else default value + """ + if key in self: + return self[key] + try: + if "/Parent" not in self: + return default + raise KeyError("not present") + except KeyError: + return cast("DictionaryObject", self["/Parent"].get_object()).get_herited( + key, default + ) + def __setitem__(self, key: Any, value: Any) -> Any: if not isinstance(key, PdfObject): raise ValueError("key must be PdfObject") diff --git a/tests/test_writer.py b/tests/test_writer.py index c4ecd5fec..b6da49c59 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1502,6 +1502,88 @@ def test_update_form_fields(tmp_path): Path(write_data_here).unlink() +@pytest.mark.enable_socket() +def test_update_form_fields2(): + myFiles = { + "test1": { + "name": "Test1 Form", + "url": "https://github.com/py-pdf/pypdf/files/14817365/test1.pdf", + "path": "iss2234a.pdf", + "usage": { + "fields": { + "First Name": "Reed", + "Middle Name": "R", + "MM": "04", + "DD": "21", + "YY": "24", + "Initial": "RRG", + # "I DO NOT Agree": null, + # "Last Name": null + }, + }, + }, + "test2": { + "name": "Test2 Form", + "url": "https://github.com/py-pdf/pypdf/files/14817366/test2.pdf", + "path": "iss2234b.pdf", + "usage": { + "fields": { + "p2 First Name": "Joe", + "p2 Middle Name": "S", + "p2 MM": "03", + "p2 DD": "31", + "p2 YY": "24", + "Initial": "JSS", + # "p2 I DO NOT Agree": "null", + "p2 Last Name": "Smith", + "p3 First Name": "John", + "p3 Middle Name": "R", + "p3 MM": "01", + "p3 DD": "25", + "p3 YY": "21", + }, + }, + }, + } + merger = PdfWriter() + + for file in myFiles: + reader = PdfReader( + BytesIO(get_data_from_url(myFiles[file]["url"], name=myFiles[file]["path"])) + ) + reader.add_form_topname(file) + writer = PdfWriter(clone_from=reader) + + for page in writer.pages: + writer.update_page_form_field_values( + page, myFiles[file]["usage"]["fields"], auto_regenerate=False + ) + merger.append(writer) + assert merger.get_form_text_fields(True) == { + "test1.First Name": "Reed", + "test1.Middle Name": "R", + "test1.MM": "04", + "test1.DD": "21", + "test1.YY": "24", + "test1.Initial": "RRG", + "test1.I DO NOT Agree": None, + "test1.Last Name": None, + "test2.p2 First Name": "Joe", + "test2.p2 Middle Name": "S", + "test2.p2 MM": "03", + "test2.p2 DD": "31", + "test2.p2 YY": "24", + "test2.Initial": "JSS", + "test2.p2 I DO NOT Agree": None, + "test2.p2 Last Name": "Smith", + "test2.p3 First Name": "John", + "test2.p3 Middle Name": "R", + "test2.p3 MM": "01", + "test2.p3 DD": "25", + "test2.p3 YY": "21", + } + + @pytest.mark.enable_socket() def test_iss1862(): # The file here has "/B" entry to define the font in a object below the page From 77a8584e43ef07e5d4c6e3e7e69c72473ecbf728 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:36:13 +0200 Subject: [PATCH 02/10] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index dac249462..7606bf272 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -370,7 +370,7 @@ def raw_get(self, key: Any) -> Any: def get_herited(self, key: str, default: Any = None) -> Any: """ Returns the value of a key or from the parent if not found - If not found returns default + If not found returns default. Args: key: string identifying the field to return From 0c031ffc38fcf64d5c25228b57f6a3ac046e6038 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:37:34 +0200 Subject: [PATCH 03/10] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 7606bf272..67584d7fa 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -387,7 +387,7 @@ def get_herited(self, key: str, default: Any = None) -> Any: return default raise KeyError("not present") except KeyError: - return cast("DictionaryObject", self["/Parent"].get_object()).get_herited( + return cast("DictionaryObject", self["/Parent"].get_object()).get_inherited( key, default ) From ffb0a8b1e1b7b19e7e38cc66efaa8498dc297bcd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:37:44 +0200 Subject: [PATCH 04/10] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 67584d7fa..c1e82fe85 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -367,7 +367,7 @@ def _clone( def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) - def get_herited(self, key: str, default: Any = None) -> Any: + def get_inherited(self, key: str, default: Any = None) -> Any: """ Returns the value of a key or from the parent if not found If not found returns default. From da3e89ef49b8d82d87dd831c42f34f4c100879e3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:37:53 +0200 Subject: [PATCH 05/10] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index c1e82fe85..07a4ab256 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -369,7 +369,7 @@ def raw_get(self, key: Any) -> Any: def get_inherited(self, key: str, default: Any = None) -> Any: """ - Returns the value of a key or from the parent if not found + Returns the value of a key or from the parent if not found. If not found returns default. Args: From d63fb1e4882be41830bd817a3ecc7261b7dc91d2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:38:02 +0200 Subject: [PATCH 06/10] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 17f309664..38d924c99 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -755,7 +755,7 @@ def _update_field_annotation( rct = RectangleObject((0, 0, _rct[2] - _rct[0], _rct[3] - _rct[1])) # Extract font information - da = anno.get_herited( + da = anno.get_inherited( AA.DA, cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get( AA.DA, None From c6a262076151c93ee8a41bdae7bc436acf48feae Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:39:44 +0200 Subject: [PATCH 07/10] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 38d924c99..ee06d63d9 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -825,7 +825,7 @@ def _update_field_annotation( # Retrieve field text and selected values field_flags = field.get(FA.Ff, 0) if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(anno.get_herited(FA.Opt, {})) + txt = "\n".join(anno.get_inherited(FA.Opt, {})) sel = field.get("/V", []) if not isinstance(sel, list): sel = [sel] From 986cab664bb6ca63161177c9c8c05f6fd4588332 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:41:47 +0200 Subject: [PATCH 08/10] Update pypdf/generic/_data_structures.py --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 07a4ab256..d7d9facc9 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -378,7 +378,7 @@ def get_inherited(self, key: str, default: Any = None) -> Any: default: default value to return Returns: - current key of herited one else default value + Current key or inherited one, otherwise default value. """ if key in self: return self[key] From f6c6ccf3aab453d6c6f126a84e07e16ac7e26f58 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 14:04:29 +0200 Subject: [PATCH 09/10] clean up --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ee06d63d9..75f81f3a4 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -780,7 +780,7 @@ def _update_field_annotation( DictionaryObject, cast( DictionaryObject, - anno.get_herited( + anno.get_inherited( "/DR", cast( DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] @@ -825,7 +825,7 @@ def _update_field_annotation( # Retrieve field text and selected values field_flags = field.get(FA.Ff, 0) if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(anno.get_inherited(FA.Opt, {})) + txt = "\n".join(anno.get_inherited(FA.Opt, [])) sel = field.get("/V", []) if not isinstance(sel, list): sel = [sel] From 234af870f251295a8533975214f5dfcaf9c827c0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 2 Apr 2024 14:28:29 +0200 Subject: [PATCH 10/10] coverage --- pypdf/_writer.py | 4 +++- tests/test_writer.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 75f81f3a4..88e280ee5 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -965,7 +965,9 @@ def update_page_form_field_values( ): # textbox self._update_field_annotation(writer_parent_annot, writer_annot) - elif writer_annot.get(FA.FT) == "/Sig": + elif ( + writer_annot.get(FA.FT) == "/Sig" + ): # deprecated # not implemented yet # signature logger_warning("Signature forms not implemented yet", __name__) diff --git a/tests/test_writer.py b/tests/test_writer.py index b6da49c59..baf0134e2 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1499,6 +1499,20 @@ def test_update_form_fields(tmp_path): assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"]) assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"]) + writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf") + writer.add_annotation( + page_number=0, + annotation=Link(target_page_index=1, rect=RectangleObject([0, 0, 100, 100])), + ) + del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DA"] + del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DR"]["/Font"] + writer.update_page_form_field_values( + writer.pages[0], + {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"}, + auto_regenerate=False, + ) + assert b"/Helv " in writer.pages[0]["/Annots"][1]["/AP"]["/N"].get_data() + Path(write_data_here).unlink() @@ -1556,7 +1570,7 @@ def test_update_form_fields2(): for page in writer.pages: writer.update_page_form_field_values( - page, myFiles[file]["usage"]["fields"], auto_regenerate=False + page, myFiles[file]["usage"]["fields"], auto_regenerate=True ) merger.append(writer) assert merger.get_form_text_fields(True) == {