Skip to content

Commit cf33a60

Browse files
authored
ENH: Add ability to add font resources for 14 Adobe Core fonts in text widget annotations (#3624)
We used to overwrite a text appearance stream's resource dictionary when we initialized it from an annotation. This would in turn overwrite any font resource that we had previously added. Make sure that we merge our new font resource into the annotation's resources instead.
1 parent 3b94ce8 commit cf33a60

File tree

4 files changed

+125
-65
lines changed

4 files changed

+125
-65
lines changed

pypdf/_font.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from dataclasses import dataclass, field
33
from typing import Any, Union, cast
44

5-
from pypdf.generic import ArrayObject, DictionaryObject
5+
from pypdf.generic import ArrayObject, DictionaryObject, NameObject
66

77
from ._cmap import get_encoding
88
from ._codecs.adobe_glyphs import adobe_glyphs
@@ -302,6 +302,18 @@ def from_font_resource(
302302
interpretable=interpretable
303303
)
304304

305+
def as_font_resource(self) -> DictionaryObject:
306+
# For now, this returns a font resource that only works with the 14 Adobe Core fonts.
307+
return (
308+
DictionaryObject({
309+
NameObject("/Subtype"): NameObject("/Type1"),
310+
NameObject("/Name"): NameObject(f"/{self.name}"),
311+
NameObject("/Type"): NameObject("/Font"),
312+
NameObject("/BaseFont"): NameObject(f"/{self.name}"),
313+
NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
314+
})
315+
)
316+
305317
def text_width(self, text: str = "") -> float:
306318
"""Sum of character widths specified in PDF font for the supplied text."""
307319
return sum(

pypdf/_writer.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -902,12 +902,14 @@ def _add_apstream_object(
902902
ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"])
903903
ap_stream_font_dict = cast(DictionaryObject, ap_stream_res.get("/Font", DictionaryObject()))
904904
if "/Font" not in pg_res:
905-
pg_res[NameObject("/Font")] = DictionaryObject()
906-
pg_font_res = cast(DictionaryObject, pg_res["/Font"])
905+
font_dict_ref = self._add_object(DictionaryObject())
906+
pg_res[NameObject("/Font")] = font_dict_ref
907+
pg_font_res = cast(DictionaryObject, pg_res["/Font"].get_object())
907908
# Merge fonts from the appearance stream into the page's font resources
908-
for font_name, font_ref in ap_stream_font_dict.items():
909+
for font_name, font_res in ap_stream_font_dict.items():
909910
if font_name not in pg_font_res:
910-
pg_font_res[font_name] = font_ref
911+
font_res_ref = self._add_object(font_res)
912+
pg_font_res[font_name] = font_res_ref
911913
# Always add the resolved stream object to the writer to get a new IndirectObject.
912914
# This ensures we have a valid IndirectObject managed by *this* writer.
913915
xobject_ref = self._add_object(appearance_stream_obj)

pypdf/generic/_appearance_stream.py

Lines changed: 95 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -343,25 +343,43 @@ def __init__(
343343

344344
# If a font resource was added, get the font character map
345345
if font_resource:
346-
font_resource = cast(DictionaryObject, font_resource.get_object())
347346
font = Font.from_font_resource(font_resource)
348347
else:
349348
logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
350349
font_name = "/Helv"
351-
font_resource = DictionaryObject({
352-
NameObject("/Subtype"): NameObject("/Type1"),
353-
NameObject("/Name"): NameObject("/Helv"),
354-
NameObject("/Type"): NameObject("/Font"),
355-
NameObject("/BaseFont"): NameObject("/Helvetica"),
356-
NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
357-
})
350+
core_font_metrics = CORE_FONT_METRICS["Helvetica"]
358351
font = Font(
359352
name="Helvetica",
360353
character_map={},
361354
encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding
362355
sub_type="Type1",
363-
font_descriptor=CORE_FONT_METRICS["Helvetica"].font_descriptor,
364-
character_widths=CORE_FONT_METRICS["Helvetica"].character_widths
356+
font_descriptor=core_font_metrics.font_descriptor,
357+
character_widths=core_font_metrics.character_widths
358+
)
359+
font_resource = font.as_font_resource()
360+
361+
# Check whether the font resource is able to encode the text value.
362+
encodable = True
363+
try:
364+
if isinstance(font.encoding, str):
365+
text.encode(font.encoding, "surrogatepass")
366+
else:
367+
supported_chars = set(font.encoding.values())
368+
if any(char not in supported_chars for char in text):
369+
encodable = False
370+
# We should add a final check against the character_map (CMap) of the font,
371+
# but we don't appear to have PDF forms with such fonts, so we skip this for
372+
# now.
373+
374+
except UnicodeEncodeError:
375+
encodable = False
376+
377+
if not encodable:
378+
logger_warning(
379+
f"Text string '{text}' contains characters not supported by font encoding. "
380+
"This may result in text corruption. "
381+
"Consider calling writer.update_page_form_field_values with auto_regenerate=True.",
382+
__name__
365383
)
366384

367385
font_glyph_byte_map: dict[str, bytes]
@@ -398,6 +416,44 @@ def __init__(
398416
})
399417
})
400418

419+
@staticmethod
420+
def _find_annotation_font_resource(
421+
font_name: str,
422+
annotation: DictionaryObject,
423+
acro_form: DictionaryObject
424+
) -> tuple[str, DictionaryObject]:
425+
# Try to find a resource dictionary for the font by examining the annotation and, if that fails,
426+
# the AcroForm resources dictionary
427+
acro_form_resources: Any = cast(
428+
DictionaryObject,
429+
annotation.get_inherited(
430+
"/DR",
431+
acro_form.get("/DR", DictionaryObject()),
432+
),
433+
)
434+
acro_form_font_resources = acro_form_resources.get("/Font", DictionaryObject())
435+
font_resource = acro_form_font_resources.get(font_name, None)
436+
437+
# Normally, we should have found a font resource by now. However, when a user has provided a specific
438+
# font name, we may not have found the associated font resource among the AcroForm resources. Also, in
439+
# case of the 14 Adobe Core fonts, we may be expected to construct a font resource ourselves.
440+
if is_null_or_none(font_resource):
441+
if font_name.removeprefix("/") not in CORE_FONT_METRICS:
442+
# Default to Helvetica if we haven't found a font resource and cannot construct one ourselves.
443+
logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
444+
font_name = "/Helvetica"
445+
core_font_metrics = CORE_FONT_METRICS[font_name.removeprefix("/")]
446+
font_resource = Font(
447+
name=font_name.removeprefix("/"),
448+
character_map={},
449+
encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding
450+
sub_type="Type1",
451+
font_descriptor=core_font_metrics.font_descriptor,
452+
character_widths=core_font_metrics.character_widths
453+
).as_font_resource()
454+
455+
return font_name, font_resource
456+
401457
@classmethod
402458
def from_text_annotation(
403459
cls,
@@ -443,6 +499,23 @@ def from_text_annotation(
443499
else:
444500
default_appearance = default_appearance.get_object()
445501

502+
# Retrieve field text and selected values
503+
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
504+
if (
505+
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
506+
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
507+
):
508+
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
509+
selection = field.get("/V", [])
510+
if not isinstance(selection, list):
511+
selection = [selection]
512+
else: # /Tx
513+
text = field.get("/V", "")
514+
selection = []
515+
516+
# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
517+
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
518+
446519
# Derive font name, size and color from the default appearance. Also set
447520
# user-provided font name and font size in the default appearance, if given.
448521
# For a font name, this presumes that we can find an associated font resource
@@ -463,46 +536,7 @@ def from_text_annotation(
463536
if user_font_size > 0:
464537
font_size = user_font_size
465538

466-
# Try to find a resource dictionary for the font
467-
document_resources: Any = cast(
468-
DictionaryObject,
469-
cast(
470-
DictionaryObject,
471-
annotation.get_inherited(
472-
"/DR",
473-
acro_form.get("/DR", DictionaryObject()),
474-
),
475-
).get_object(),
476-
)
477-
document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
478-
# CORE_FONT_METRICS is the dict with Standard font metrics
479-
if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
480-
# ...or AcroForm dictionary
481-
document_resources = cast(
482-
dict[Any, Any],
483-
acro_form.get("/DR", {}),
484-
)
485-
document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
486-
font_resource = document_font_resources.get(font_name, None)
487-
if not is_null_or_none(font_resource):
488-
font_resource = cast(DictionaryObject, font_resource.get_object())
489-
490-
# Retrieve field text and selected values
491-
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
492-
if (
493-
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
494-
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
495-
):
496-
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
497-
selection = field.get("/V", [])
498-
if not isinstance(selection, list):
499-
selection = [selection]
500-
else: # /Tx
501-
text = field.get("/V", "")
502-
selection = []
503-
504-
# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
505-
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
539+
font_name, font_resource = cls._find_annotation_font_resource(font_name, annotation, acro_form)
506540

507541
# Retrieve formatting information
508542
is_comb = False
@@ -535,11 +569,21 @@ def from_text_annotation(
535569
is_comb=is_comb,
536570
max_length=max_length
537571
)
572+
538573
if AnnotationDictionaryAttributes.AP in annotation:
539574
for key, value in (
540575
cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
541576
):
542-
if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
577+
if key in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
578+
continue
579+
# Don't overwrite font resources added by TextAppearanceStream.__init__
580+
if key == "/Resources":
581+
if "/Font" not in value:
582+
value.get_object()[NameObject("/Font")] = DictionaryObject()
583+
value["/Font"].get_object()[NameObject(font_name)] = getattr(
584+
font_resource, "indirect_reference", font_resource
585+
)
586+
else:
543587
new_appearance_stream[key] = value
544588

545589
return new_appearance_stream

tests/test_writer.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,7 +1506,7 @@ def test_named_dest_page_number():
15061506
assert len(writer.root_object["/Names"]["/Dests"]["/Names"]) == 6
15071507

15081508

1509-
def test_update_form_fields(tmp_path):
1509+
def test_update_form_fields(caplog, tmp_path):
15101510
write_data_here = tmp_path / "out.pdf"
15111511
writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf")
15121512
writer.update_page_form_field_values(
@@ -1572,10 +1572,11 @@ def test_update_form_fields(tmp_path):
15721572
del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DR"]["/Font"]
15731573
writer.update_page_form_field_values(
15741574
[writer.pages[0], writer.pages[1]],
1575-
{"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
1575+
{"Text1": "!مرحبا بالعالم", "Text2": "ligne1\nligne2\nligne3"},
15761576
auto_regenerate=False,
15771577
)
1578-
assert b"/Helv " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
1578+
assert b"/Helvetica " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
1579+
assert "Text string '!مرحبا بالعالم' contains characters not supported by font encoding." in caplog.text
15791580
writer.update_page_form_field_values(
15801581
None,
15811582
{"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
@@ -1646,7 +1647,7 @@ def test_merge_content_stream_to_page():
16461647

16471648

16481649
@pytest.mark.enable_socket
1649-
def test_update_form_fields2():
1650+
def test_update_form_fields2(caplog):
16501651
my_files = {
16511652
"test1": {
16521653
"name": "Test1 Form",
@@ -1679,7 +1680,7 @@ def test_update_form_fields2():
16791680
"Initial": "JSS",
16801681
# "p2 I DO NOT Agree": "null",
16811682
"p2 Last Name": "Smith",
1682-
"p3 First Name": "John",
1683+
"p3 First Name": "شهرزاد",
16831684
"p3 Middle Name": "R",
16841685
"p3 MM": "01",
16851686
"p3 DD": "25",
@@ -1718,12 +1719,13 @@ def test_update_form_fields2():
17181719
"test2.Initial": "JSS",
17191720
"test2.p2 I DO NOT Agree": None,
17201721
"test2.p2 Last Name": "Smith",
1721-
"test2.p3 First Name": "John",
1722+
"test2.p3 First Name": "شهرزاد",
17221723
"test2.p3 Middle Name": "R",
17231724
"test2.p3 MM": "01",
17241725
"test2.p3 DD": "25",
17251726
"test2.p3 YY": "21",
17261727
}
1728+
assert "Text string 'شهرزاد' contains characters not supported by font encoding." in caplog.text
17271729

17281730

17291731
@pytest.mark.enable_socket
@@ -2411,7 +2413,7 @@ def test_selfont():
24112413

24122414

24132415
@pytest.mark.enable_socket
2414-
def test_no_resource_for_14_std_fonts(caplog):
2416+
def test_no_resource_for_14_std_fonts():
24152417
"""Cf #2670"""
24162418
url = "https://github.com/py-pdf/pypdf/files/15405390/f1040.pdf"
24172419
name = "iss2670.pdf"
@@ -2423,7 +2425,7 @@ def test_no_resource_for_14_std_fonts(caplog):
24232425
writer.update_page_form_field_values(
24242426
p, {a["/T"]: "Brooks"}, auto_regenerate=False
24252427
)
2426-
assert "Font dictionary for /Helvetica not found; defaulting to Helvetica." in caplog.text
2428+
assert "/Helvetica" in a["/AP"]["/N"]["/Resources"]["/Font"]
24272429

24282430

24292431
@pytest.mark.enable_socket
@@ -2435,7 +2437,7 @@ def test_field_box_upside_down():
24352437
writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"})
24362438
assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == (
24372439
b"q\n/Tx BMC \nq\n2 1 102.29520000000001 9.835000000000036 re\n"
2438-
b"W\nBT\n/Helv 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
2440+
b"W\nBT\n/Arial 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
24392441
b"Q\nEMC\nQ\n"
24402442
)
24412443
box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"]

0 commit comments

Comments
 (0)