ENH: Add ability to add font resources for 14 Adobe Core fonts in text widget annotations (#3624)

PJBrs · web-flow · commit cf33a60c5a9f · 2026-02-06T11:17:35.000+01:00
We used to replace a text appearance stream's resource dictionary
when we created the stream from an annotation. This discarded any
font resource that we had previously added to it. Make sure that we
merge our new font resource into the annotation's resources instead.
diff --git a/pypdf/_font.py b/pypdf/_font.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Union, cast
 
-from pypdf.generic import ArrayObject, DictionaryObject
+from pypdf.generic import ArrayObject, DictionaryObject, NameObject
 
 from ._cmap import get_encoding
 from ._codecs.adobe_glyphs import adobe_glyphs
@@ -302,6 +302,18 @@ def from_font_resource(
             interpretable=interpretable
         )
 
+    def as_font_resource(self) -> DictionaryObject:
+        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
+        return (
+            DictionaryObject({
+                NameObject("/Subtype"): NameObject("/Type1"),
+                NameObject("/Name"): NameObject(f"/{self.name}"),
+                NameObject("/Type"): NameObject("/Font"),
+                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
+                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
+            })
+        )
+
     def text_width(self, text: str = "") -> float:
         """Sum of character widths specified in PDF font for the supplied text."""
         return sum(
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -902,12 +902,14 @@ def _add_apstream_object(
             ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"])
             ap_stream_font_dict = cast(DictionaryObject, ap_stream_res.get("/Font", DictionaryObject()))
             if "/Font" not in pg_res:
-                pg_res[NameObject("/Font")] = DictionaryObject()
-            pg_font_res = cast(DictionaryObject, pg_res["/Font"])
+                font_dict_ref = self._add_object(DictionaryObject())
+                pg_res[NameObject("/Font")] = font_dict_ref
+            pg_font_res = cast(DictionaryObject, pg_res["/Font"].get_object())
             # Merge fonts from the appearance stream into the page's font resources
-            for font_name, font_ref in ap_stream_font_dict.items():
+            for font_name, font_res in ap_stream_font_dict.items():
                 if font_name not in pg_font_res:
-                    pg_font_res[font_name] = font_ref
+                    font_res_ref = self._add_object(font_res)
+                    pg_font_res[font_name] = font_res_ref
         # Always add the resolved stream object to the writer to get a new IndirectObject.
         # This ensures we have a valid IndirectObject managed by *this* writer.
         xobject_ref = self._add_object(appearance_stream_obj)
diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py
@@ -343,25 +343,43 @@ def __init__(
 
         # If a font resource was added, get the font character map
         if font_resource:
-            font_resource = cast(DictionaryObject, font_resource.get_object())
             font = Font.from_font_resource(font_resource)
         else:
             logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
             font_name = "/Helv"
-            font_resource = DictionaryObject({
-                NameObject("/Subtype"): NameObject("/Type1"),
-                NameObject("/Name"): NameObject("/Helv"),
-                NameObject("/Type"): NameObject("/Font"),
-                NameObject("/BaseFont"): NameObject("/Helvetica"),
-                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
-            })
+            core_font_metrics = CORE_FONT_METRICS["Helvetica"]
             font = Font(
                 name="Helvetica",
                 character_map={},
                 encoding=dict(zip(range(256), fill_from_encoding("cp1252"))),  # WinAnsiEncoding
                 sub_type="Type1",
-                font_descriptor=CORE_FONT_METRICS["Helvetica"].font_descriptor,
-                character_widths=CORE_FONT_METRICS["Helvetica"].character_widths
+                font_descriptor=core_font_metrics.font_descriptor,
+                character_widths=core_font_metrics.character_widths
+            )
+            font_resource = font.as_font_resource()
+
+        # Check whether the font resource is able to encode the text value.
+        encodable = True
+        try:
+            if isinstance(font.encoding, str):
+                text.encode(font.encoding, "surrogatepass")
+            else:
+                supported_chars = set(font.encoding.values())
+                if any(char not in supported_chars for char in text):
+                    encodable = False
+            # We should add a final check against the character_map (CMap) of the font,
+            # but we don't appear to have PDF forms with such fonts, so we skip this for
+            # now.
+
+        except UnicodeEncodeError:
+            encodable = False
+
+        if not encodable:
+            logger_warning(
+                f"Text string '{text}' contains characters not supported by font encoding. "
+                "This may result in text corruption. "
+                "Consider calling writer.update_page_form_field_values with auto_regenerate=True.",
+                __name__
             )
 
         font_glyph_byte_map: dict[str, bytes]
@@ -398,6 +416,44 @@ def __init__(
             })
         })
 
+    @staticmethod
+    def _find_annotation_font_resource(
+            font_name: str,
+            annotation: DictionaryObject,
+            acro_form: DictionaryObject
+        ) -> tuple[str, DictionaryObject]:
+        # Try to find a resource dictionary for the font by examining the annotation and, if that fails,
+        # the AcroForm resources dictionary
+        acro_form_resources: Any = cast(
+            DictionaryObject,
+            annotation.get_inherited(
+                "/DR",
+                acro_form.get("/DR", DictionaryObject()),
+            ),
+        )
+        acro_form_font_resources = acro_form_resources.get("/Font", DictionaryObject())
+        font_resource = acro_form_font_resources.get(font_name, None)
+
+        # Normally, we should have found a font resource by now. However, when a user has provided a specific
+        # font name, we may not have found the associated font resource among the AcroForm resources. Also, in
+        # case of the 14 Adobe Core fonts, we may be expected to construct a font resource ourselves.
+        if is_null_or_none(font_resource):
+            if font_name.removeprefix("/") not in CORE_FONT_METRICS:
+                # Default to Helvetica if we haven't found a font resource and cannot construct one ourselves.
+                logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
+                font_name = "/Helvetica"
+            core_font_metrics = CORE_FONT_METRICS[font_name.removeprefix("/")]
+            font_resource = Font(
+                name=font_name.removeprefix("/"),
+                character_map={},
+                encoding=dict(zip(range(256), fill_from_encoding("cp1252"))),  # WinAnsiEncoding
+                sub_type="Type1",
+                font_descriptor=core_font_metrics.font_descriptor,
+                character_widths=core_font_metrics.character_widths
+            ).as_font_resource()
+
+        return font_name, font_resource
+
     @classmethod
     def from_text_annotation(
         cls,
@@ -443,6 +499,23 @@ def from_text_annotation(
         else:
             default_appearance = default_appearance.get_object()
 
+        # Retrieve field text and selected values
+        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
+        if (
+                field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
+                field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
+        ):
+            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
+            selection = field.get("/V", [])
+            if not isinstance(selection, list):
+                selection = [selection]
+        else:  # /Tx
+            text = field.get("/V", "")
+            selection = []
+
+        # Escape backslashes and parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
+        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
+
         # Derive font name, size and color from the default appearance. Also set
         # user-provided font name and font size in the default appearance, if given.
         # For a font name, this presumes that we can find an associated font resource
@@ -463,46 +536,7 @@ def from_text_annotation(
         if user_font_size > 0:
             font_size = user_font_size
 
-        # Try to find a resource dictionary for the font
-        document_resources: Any = cast(
-            DictionaryObject,
-            cast(
-                DictionaryObject,
-                annotation.get_inherited(
-                    "/DR",
-                    acro_form.get("/DR", DictionaryObject()),
-                ),
-            ).get_object(),
-        )
-        document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
-        # CORE_FONT_METRICS is the dict with Standard font metrics
-        if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
-            # ...or AcroForm dictionary
-            document_resources = cast(
-                dict[Any, Any],
-                acro_form.get("/DR", {}),
-            )
-            document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
-        font_resource = document_font_resources.get(font_name, None)
-        if not is_null_or_none(font_resource):
-            font_resource = cast(DictionaryObject, font_resource.get_object())
-
-        # Retrieve field text and selected values
-        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
-        if (
-                field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
-                field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
-        ):
-            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
-            selection = field.get("/V", [])
-            if not isinstance(selection, list):
-                selection = [selection]
-        else:  # /Tx
-            text = field.get("/V", "")
-            selection = []
-
-        # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
-        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
+        font_name, font_resource = cls._find_annotation_font_resource(font_name, annotation, acro_form)
 
         # Retrieve formatting information
         is_comb = False
@@ -535,11 +569,21 @@ def from_text_annotation(
             is_comb=is_comb,
             max_length=max_length
         )
+
         if AnnotationDictionaryAttributes.AP in annotation:
             for key, value in (
                 cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
             ):
-                if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
+                if key in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
+                    continue
+                # Don't overwrite font resources added by TextAppearanceStream.__init__
+                if key == "/Resources":
+                    if "/Font" not in value:
+                        value.get_object()[NameObject("/Font")] = DictionaryObject()
+                    value["/Font"].get_object()[NameObject(font_name)] = getattr(
+                        font_resource, "indirect_reference", font_resource
+                    )
+                else:
                     new_appearance_stream[key] = value
 
         return new_appearance_stream
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -1506,7 +1506,7 @@ def test_named_dest_page_number():
     assert len(writer.root_object["/Names"]["/Dests"]["/Names"]) == 6
 
 
-def test_update_form_fields(tmp_path):
+def test_update_form_fields(caplog, tmp_path):
     write_data_here = tmp_path / "out.pdf"
     writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf")
     writer.update_page_form_field_values(
@@ -1572,10 +1572,11 @@ def test_update_form_fields(tmp_path):
     del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DR"]["/Font"]
     writer.update_page_form_field_values(
         [writer.pages[0], writer.pages[1]],
-        {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
+        {"Text1": "!مرحبا بالعالم", "Text2": "ligne1\nligne2\nligne3"},
         auto_regenerate=False,
     )
-    assert b"/Helv " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
+    assert b"/Helvetica " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
+    assert "Text string '!مرحبا بالعالم' contains characters not supported by font encoding." in caplog.text
     writer.update_page_form_field_values(
         None,
         {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
@@ -1646,7 +1647,7 @@ def test_merge_content_stream_to_page():
 
 
 @pytest.mark.enable_socket
-def test_update_form_fields2():
+def test_update_form_fields2(caplog):
     my_files = {
         "test1": {
             "name": "Test1 Form",
@@ -1679,7 +1680,7 @@ def test_update_form_fields2():
                     "Initial": "JSS",
                     # "p2 I DO NOT Agree": "null",
                     "p2 Last Name": "Smith",
-                    "p3 First Name": "John",
+                    "p3 First Name": "شهرزاد",
                     "p3 Middle Name": "R",
                     "p3 MM": "01",
                     "p3 DD": "25",
@@ -1718,12 +1719,13 @@ def test_update_form_fields2():
         "test2.Initial": "JSS",
         "test2.p2 I DO NOT Agree": None,
         "test2.p2 Last Name": "Smith",
-        "test2.p3 First Name": "John",
+        "test2.p3 First Name": "شهرزاد",
         "test2.p3 Middle Name": "R",
         "test2.p3 MM": "01",
         "test2.p3 DD": "25",
         "test2.p3 YY": "21",
     }
+    assert "Text string 'شهرزاد' contains characters not supported by font encoding." in caplog.text
 
 
 @pytest.mark.enable_socket
@@ -2411,7 +2413,7 @@ def test_selfont():
 
 
 @pytest.mark.enable_socket
-def test_no_resource_for_14_std_fonts(caplog):
+def test_no_resource_for_14_std_fonts():
     """Cf #2670"""
     url = "https://github.com/py-pdf/pypdf/files/15405390/f1040.pdf"
     name = "iss2670.pdf"
@@ -2423,7 +2425,7 @@ def test_no_resource_for_14_std_fonts(caplog):
             writer.update_page_form_field_values(
                 p, {a["/T"]: "Brooks"}, auto_regenerate=False
             )
-    assert "Font dictionary for /Helvetica not found; defaulting to Helvetica." in caplog.text
+            assert "/Helvetica" in a["/AP"]["/N"]["/Resources"]["/Font"]
 
 
 @pytest.mark.enable_socket
@@ -2435,7 +2437,7 @@ def test_field_box_upside_down():
     writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"})
     assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == (
         b"q\n/Tx BMC \nq\n2 1 102.29520000000001 9.835000000000036 re\n"
-        b"W\nBT\n/Helv 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
+        b"W\nBT\n/Arial 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
         b"Q\nEMC\nQ\n"
     )
     box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"]