Skip to content

BUG: Get font information more reliably when removing text #3252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 47 additions & 13 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,7 +2054,7 @@ def remove_objects_from_page(
text_filters: Properties of text to be deleted, if applicable. Optional.
This is a Python dictionary with the following properties:

* font_ids: List of font IDs (such as /F1 or /T1_0) to be deleted.
* font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

"""
if isinstance(to_delete, (list, tuple)):
Expand Down Expand Up @@ -2119,8 +2119,9 @@ def clean(
)
):
if (
not to_delete & ObjectDeletionFlag.TEXT
or (not font_ids_to_delete or font_id in font_ids_to_delete)
not to_delete & ObjectDeletionFlag.TEXT
or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
):
del content.operations[i]
else:
Expand Down Expand Up @@ -2246,16 +2247,49 @@ def remove_text(self, font_names: Optional[List[str]] = None) -> None:
font_names = []

for page in self.pages:
font_ids = []
fonts = page.get("/Resources", {}).get("/Font", {})
for font_id, font_info in fonts.items():
font_name = font_info.get("/BaseFont", "").split("+")[-1]
if font_name in font_names:
font_ids.append(font_id)

text_filters = {
"font_ids": font_ids,
}
resource_ids_to_remove = []

# Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
# Font names need to be converted to resource names/IDs for easier removal
if font_names:
# Recursively loop through page objects to gather font info
def get_font_info(
    obj: Any,
    font_info: Optional[Dict[str, Any]] = None,
    key: Optional[str] = None,
    _visited: Optional[Dict[int, Any]] = None,
) -> Dict[str, Any]:
    """
    Recursively collect the resource IDs under which fonts are registered.

    Args:
        obj: Object to inspect. Indirect objects are resolved; dictionaries
            and arrays are traversed recursively; anything else is ignored.
        font_info: Accumulator shared across the recursion. A new
            dictionary is created when omitted.
        key: Dictionary key under which ``obj`` was found in its parent,
            or None when ``obj`` is the root or an array element.
        _visited: Internal bookkeeping of containers already traversed,
            used to stop on cyclic object graphs. Callers should not
            pass this.

    Returns:
        Mapping of normalized font name to a dictionary with the keys
        ``"normalized_font_name"`` and ``"resource_ids"`` (the list of
        dictionary keys, such as ``/F1``, that point to that font).

    """
    if font_info is None:
        font_info = {}
    if _visited is None:
        _visited = {}
    if isinstance(obj, IndirectObject):
        obj = obj.get_object()
    if isinstance(obj, dict):
        if obj.get("/Type") == "/Font":
            font_name = obj.get("/BaseFont", "")
            # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
            normalized_font_name = font_name.lstrip("/").split("+")[-1]
            entry = font_info.setdefault(
                normalized_font_name,
                {
                    "normalized_font_name": normalized_font_name,
                    "resource_ids": [],
                },
            )
            # Fonts reached through an array (for example the entries of
            # /DescendantFonts) have no resource name of their own; do not
            # record None as a resource ID.
            if key is not None and key not in entry["resource_ids"]:
                entry["resource_ids"].append(key)
        # The key is recorded above even for an already-seen font, as the
        # same font object may be registered under several resource names.
        # Only the descent into its children is skipped, which would
        # otherwise never terminate on cyclic references.
        if id(obj) in _visited:
            return font_info
        # Keep a reference to the object so its id() cannot be reused
        # while the traversal is still running.
        _visited[id(obj)] = obj
        for k in obj:
            font_info = get_font_info(obj[k], font_info, k, _visited)
    elif isinstance(obj, (list, ArrayObject)):
        if id(obj) in _visited:
            return font_info
        _visited[id(obj)] = obj
        for child_obj in obj:
            font_info = get_font_info(child_obj, font_info, None, _visited)
    return font_info

# Add relevant resource names for removal
font_info = get_font_info(page.get("/Resources"))
for font_name in font_names:
if font_name in font_info:
resource_ids_to_remove.extend(font_info[font_name]["resource_ids"])

text_filters = {}
if font_names:
text_filters["font_ids"] = resource_ids_to_remove
self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

def add_uri(
Expand Down
26 changes: 22 additions & 4 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,22 +1386,40 @@ def test_new_removes():
assert b"Chap" not in bb
assert b" TJ" not in bb

# Test removing text in a specified font
writer = PdfWriter()
writer.clone_document_from_reader(reader)
b = BytesIO()
writer.write(b)
reader = PdfReader(b)
text = reader.pages[0].extract_text()
temp_reader = PdfReader(b)
text = temp_reader.pages[0].extract_text()
assert "Arbeitsschritt" in text
assert "Modelltechnik" in text
writer.remove_text(font_names=["LiberationSans-Bold"])
b = BytesIO()
writer.write(b)
reader = PdfReader(b)
text = reader.pages[0].extract_text()
temp_reader = PdfReader(b)
text = temp_reader.pages[0].extract_text()
assert "Arbeitsschritt" not in text
assert "Modelltechnik" in text

# Test removing text in a specified font that doesn't exist (nothing should happen)
writer = PdfWriter()
writer.clone_document_from_reader(reader)
b = BytesIO()
writer.write(b)
temp_reader = PdfReader(b)
text = temp_reader.pages[0].extract_text()
assert "Arbeitsschritt" in text
assert "Modelltechnik" in text
writer.remove_text(font_names=["ComicSans-Oblique"])
b = BytesIO()
writer.write(b)
temp_reader = PdfReader(b)
text = temp_reader.pages[0].extract_text()
assert "Arbeitsschritt" in text
assert "Modelltechnik" in text

url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf"
name = "GeoBaseWithComments.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
Expand Down
Loading