From c66a7e48ed6f7f496b28fd38949ffc866a6a7149 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Tue, 29 Jul 2025 16:19:56 -0400 Subject: [PATCH 1/2] Add PDF image optimization --- requirements.txt | 1 + scripts/test_image_optimization.py | 124 ++++++++++++++---- .../size/insights/apple/image_optimization.py | 26 +++- 3 files changed, 121 insertions(+), 30 deletions(-) diff --git a/requirements.txt b/requirements.txt index c135af92..c8af2afa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ asn1crypto>=1.5.0 cryptography>=41.0.0 datadog==0.51.* lzfse>=0.4.2 +PyMuPDF>=1.23.0 diff --git a/scripts/test_image_optimization.py b/scripts/test_image_optimization.py index cf25d7fb..4fb7469f 100755 --- a/scripts/test_image_optimization.py +++ b/scripts/test_image_optimization.py @@ -11,6 +11,7 @@ from typing import List, NamedTuple import pillow_heif # type: ignore +import fitz # type: ignore from PIL import Image # Register HEIF support @@ -85,6 +86,27 @@ def test_heic_conversion(img: Image.Image, file_size: int) -> OptimizationResult return None +def test_pdf_optimization(pdf_path: Path, file_size: int) -> OptimizationResult | None: + """Test PDF optimization.""" + try: + doc = fitz.open(pdf_path) + optimized_bytes = doc.tobytes(garbage=4, deflate=True) + doc.close() + new_size = len(optimized_bytes) + if new_size < file_size: + savings = file_size - new_size + return OptimizationResult( + original_size=file_size, + optimized_size=new_size, + savings=savings, + savings_percent=(savings / file_size) * 100, + method="optimized_PDF" + ) + except Exception as e: + print(f" ❌ PDF optimization failed: {e}") + return None + + def save_optimized_image(img: Image.Image, output_path: Path, method: str) -> None: """Save the optimized image to disk.""" try: @@ -99,17 +121,65 @@ def save_optimized_image(img: Image.Image, output_path: Path, method: str) -> No print(f" ❌ Failed to save optimized image: {e}") -def process_image(image_path: Path, output_dir: Path) 
-> None: - """Process a single image and save optimized versions.""" - print(f"\n📸 Processing: {image_path.name}") +def save_optimized_pdf(pdf_path: Path, output_path: Path) -> None: + """Save the optimized PDF to disk.""" + try: + doc = fitz.open(pdf_path) + optimized_bytes = doc.tobytes(garbage=4, deflate=True) + doc.close() + with open(output_path, 'wb') as f: + f.write(optimized_bytes) + except Exception as e: + print(f" ❌ Failed to save optimized PDF: {e}") + + +def process_file(file_path: Path, output_dir: Path) -> None: + """Process a single file (image or PDF) and save optimized versions.""" + print(f"\n📄 Processing: {file_path.name}") try: - file_size = image_path.stat().st_size + file_size = file_path.stat().st_size print(f" 📊 Original size: {format_size(file_size)}") - with Image.open(image_path) as img: + file_ext = file_path.suffix.lower() + + # Handle PDF files + if file_ext == '.pdf': + print(f" 🏷️ Format: PDF") + results: List[OptimizationResult] = [] + + if result := test_pdf_optimization(file_path, file_size): + results.append(result) + + if not results: + print(" ℹ️ No optimization opportunities found") + return + + # Show results + print(" 💡 Optimization opportunities:") + for i, result in enumerate(results, 1): + print(f" {i}. 
{result.method}: " + f"{format_size(result.savings)} saved " + f"({result.savings_percent:.1f}%) " + f"→ {format_size(result.optimized_size)}") + + # Save optimized PDF + best_result = results[0] + if best_result.savings >= 4096: # 4KB threshold + output_path = output_dir / f"{file_path.stem}_optimized.pdf" + save_optimized_pdf(file_path, output_path) + print(f" ✅ Saved optimized version: {output_path.name}") + print(f" 🎯 Savings: {format_size(best_result.savings)} " + f"({best_result.savings_percent:.1f}%)") + else: + print(f" ⚠️ Savings ({format_size(best_result.savings)}) " + f"below 4KB threshold") + return + + # Handle image files + with Image.open(file_path) as img: img.load() # type: ignore - fmt = (img.format or image_path.suffix[1:]).lower() + fmt = (img.format or file_path.suffix[1:]).lower() print(f" 🏷️ Format: {fmt.upper()}, Mode: {img.mode}, Size: {img.size}") results: List[OptimizationResult] = [] @@ -145,13 +215,13 @@ def process_image(image_path: Path, output_dir: Path) -> None: # Save the best optimization best_result = max(results, key=lambda r: r.savings) if best_result.savings >= 4096: # 4KB threshold - stem = image_path.stem + stem = file_path.stem if best_result.method == "HEIC": output_path = output_dir / f"{stem}_optimized.heic" elif best_result.method.startswith("minified_"): - output_path = output_dir / f"{stem}_optimized{image_path.suffix}" + output_path = output_dir / f"{stem}_optimized{file_path.suffix}" else: - output_path = output_dir / f"{stem}_optimized{image_path.suffix}" + output_path = output_dir / f"{stem}_optimized{file_path.suffix}" save_optimized_image(img, output_path, best_result.method) print(f" ✅ Saved optimized version: {output_path.name}") @@ -162,18 +232,18 @@ def process_image(image_path: Path, output_dir: Path) -> None: f"below 4KB threshold") except Exception as e: - print(f" ❌ Failed to process image: {e}") + print(f" ❌ Failed to process file: {e}") def main(): """Main function.""" if 
len(sys.argv) != 2: - print("Usage: python test_image_optimization.py ") + print("Usage: python test_image_optimization.py ") print("\nThis script will:") - print("- Analyze images for optimization opportunities") + print("- Analyze images and PDFs for optimization opportunities") print("- Save optimized versions with '_optimized' suffix") print("- Show before/after file sizes and savings") - print("- Allow you to visually compare original vs optimized") + print("- Allow you to compare original vs optimized files") sys.exit(1) input_path = Path(sys.argv[1]) @@ -183,35 +253,35 @@ def main(): sys.exit(1) # Collect image files - image_extensions = {'.png', '.jpg', '.jpeg', '.heif', '.heic'} + image_extensions = {'.png', '.jpg', '.jpeg', '.heif', '.heic', '.pdf'} if input_path.is_file(): if input_path.suffix.lower() not in image_extensions: - print(f"❌ Not a supported image file: {input_path}") + print(f"❌ Not a supported file: {input_path}") sys.exit(1) - image_files = [input_path] + files = [input_path] output_dir = input_path.parent else: - image_files: List[Path] = [] + files: List[Path] = [] for ext in image_extensions: - image_files.extend(input_path.glob(f"*{ext}")) - image_files.extend(input_path.glob(f"*{ext.upper()}")) + files.extend(input_path.glob(f"*{ext}")) + files.extend(input_path.glob(f"*{ext.upper()}")) - if not image_files: - print(f"❌ No supported image files found in: {input_path}") + if not files: + print(f"❌ No supported files found in: {input_path}") sys.exit(1) output_dir = input_path - print(f"🔍 Found {len(image_files)} image(s) to process") + print(f"🔍 Found {len(files)} file(s) to process") print(f"📁 Output directory: {output_dir}") - # Process each image - for image_file in sorted(image_files): - process_image(image_file, output_dir) + # Process each file + for file in sorted(files): + process_file(file, output_dir) print(f"\n✨ Processing complete!") - print(f"📂 Check {output_dir} for optimized images with '_optimized' 
suffix") - print("👀 Open original and optimized images side-by-side to compare quality") + print(f"📂 Check {output_dir} for optimized files with '_optimized' suffix") + print("👀 Open original and optimized files side-by-side to compare quality") if __name__ == "__main__": diff --git a/src/launchpad/size/insights/apple/image_optimization.py b/src/launchpad/size/insights/apple/image_optimization.py index bb604831..50005af8 100644 --- a/src/launchpad/size/insights/apple/image_optimization.py +++ b/src/launchpad/size/insights/apple/image_optimization.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Iterable, List, Sequence +import fitz # type: ignore import pillow_heif # type: ignore from PIL import Image @@ -38,7 +39,7 @@ class _OptimizationResult: class ImageOptimizationInsight(Insight[ImageOptimizationInsightResult]): """Analyse image optimisation opportunities in iOS apps.""" - OPTIMIZABLE_FORMATS = {"png", "jpg", "jpeg", "heif", "heic"} + OPTIMIZABLE_FORMATS = {"png", "jpg", "jpeg", "heif", "heic", "pdf"} MIN_SAVINGS_THRESHOLD = 4096 TARGET_JPEG_QUALITY = 85 TARGET_HEIC_QUALITY = 85 @@ -104,8 +105,16 @@ def _analyze_image_optimization( if res := self._check_heic_minification(img, file_size): minify_savings, minified_size = res.savings, res.optimized_size except Exception as exc: - logger.error("Failed to process %s: %s", display_path, exc) - return None + if file_type.lower() == "pdf": + try: + if res := self._check_pdf_optimization(full_path, file_size): + minify_savings, minified_size = res.savings, res.optimized_size + except Exception as pdf_exc: + logger.error("Failed to process PDF %s: %s", display_path, pdf_exc) + return None + else: + logger.error("Failed to process %s: %s", display_path, exc) + return None if max(minify_savings, conversion_savings) < self.MIN_SAVINGS_THRESHOLD: return None @@ -154,6 +163,17 @@ def _check_heic_minification(self, img: Image.Image, file_size: int) -> _Optimiz logger.error("HEIC minification check 
failed: %s", exc) return None + def _check_pdf_optimization(self, file_path: Path, file_size: int) -> _OptimizationResult | None: + try: + doc = fitz.open(file_path) + optimized_bytes = doc.tobytes(garbage=4, deflate=True) + doc.close() + new_size = len(optimized_bytes) + return _OptimizationResult(file_size - new_size, new_size) if new_size < file_size else None + except Exception as exc: + logger.error("PDF optimization check failed: %s", exc) + return None + def _iter_optimizable_files(self, files: Sequence[FileInfo]) -> Iterable[FileInfo]: for fi in files: if fi.file_type == "car": From 13e815ce1edf4c1b324d82efc2d77a059b016477 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Tue, 29 Jul 2025 16:28:32 -0400 Subject: [PATCH 2/2] update --- requirements.txt | 2 +- scripts/test_image_optimization.py | 33 ++++++++++++++----- .../size/insights/apple/image_optimization.py | 19 ++++++++--- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index c8af2afa..341c126c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ asn1crypto>=1.5.0 cryptography>=41.0.0 datadog==0.51.* lzfse>=0.4.2 -PyMuPDF>=1.23.0 +pypdf>=5.0.0 diff --git a/scripts/test_image_optimization.py b/scripts/test_image_optimization.py index 4fb7469f..dde6732f 100755 --- a/scripts/test_image_optimization.py +++ b/scripts/test_image_optimization.py @@ -11,7 +11,7 @@ from typing import List, NamedTuple import pillow_heif # type: ignore -import fitz # type: ignore +from pypdf import PdfReader, PdfWriter from PIL import Image # Register HEIF support @@ -89,10 +89,19 @@ def test_heic_conversion(img: Image.Image, file_size: int) -> OptimizationResult def test_pdf_optimization(pdf_path: Path, file_size: int) -> OptimizationResult | None: """Test PDF optimization.""" try: - doc = fitz.open(pdf_path) - optimized_bytes = doc.tobytes(garbage=4, deflate=True) - doc.close() - new_size = len(optimized_bytes) + reader = PdfReader(pdf_path) + writer = 
PdfWriter() + + for page in reader.pages: + page.compress_content_streams(level=9) + writer.add_page(page) + + writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) + + with io.BytesIO() as buf: + writer.write(buf) + new_size = buf.tell() + if new_size < file_size: savings = file_size - new_size return OptimizationResult( @@ -124,11 +133,17 @@ def save_optimized_image(img: Image.Image, output_path: Path, method: str) -> No def save_optimized_pdf(pdf_path: Path, output_path: Path) -> None: """Save the optimized PDF to disk.""" try: - doc = fitz.open(pdf_path) - optimized_bytes = doc.tobytes(garbage=4, deflate=True) - doc.close() + reader = PdfReader(pdf_path) + writer = PdfWriter() + + for page in reader.pages: + page.compress_content_streams(level=9) + writer.add_page(page) + + writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) + with open(output_path, 'wb') as f: - f.write(optimized_bytes) + writer.write(f) except Exception as e: print(f" ❌ Failed to save optimized PDF: {e}") diff --git a/src/launchpad/size/insights/apple/image_optimization.py b/src/launchpad/size/insights/apple/image_optimization.py index 50005af8..8851129a 100644 --- a/src/launchpad/size/insights/apple/image_optimization.py +++ b/src/launchpad/size/insights/apple/image_optimization.py @@ -8,10 +8,10 @@ from pathlib import Path from typing import Iterable, List, Sequence -import fitz # type: ignore import pillow_heif # type: ignore from PIL import Image +from pypdf import PdfReader, PdfWriter from launchpad.size.insights.insight import Insight, InsightsInput from launchpad.size.models.common import FileInfo @@ -165,10 +165,19 @@ def _check_heic_minification(self, img: Image.Image, file_size: int) -> _Optimiz def _check_pdf_optimization(self, file_path: Path, file_size: int) -> _OptimizationResult | None: try: - doc = fitz.open(file_path) - optimized_bytes = doc.tobytes(garbage=4, deflate=True) - doc.close() - new_size = len(optimized_bytes) + 
reader = PdfReader(file_path) + writer = PdfWriter() + + for page in reader.pages: + page.compress_content_streams(level=9) + writer.add_page(page) + + writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) + + with io.BytesIO() as buf: + writer.write(buf) + new_size = buf.tell() + return _OptimizationResult(file_size - new_size, new_size) if new_size < file_size else None except Exception as exc: logger.error("PDF optimization check failed: %s", exc)