diff --git a/pypdf/_page.py b/pypdf/_page.py index e7b47882c..4a022ab8f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -64,7 +64,6 @@ from .constants import PageAttributes as PG from .constants import Resources as RES from .errors import PageSizeNotDefinedError, PdfReadError -from .filters import _xobj_to_image from .generic import ( ArrayObject, ContentStream, @@ -374,7 +373,7 @@ def replace(self, new_image: Image, **kwargs: Any) -> None: from ._reader import PdfReader # noqa: PLC0415 # to prevent circular import - from .filters import _xobj_to_image # noqa: PLC0415 + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 from .generic import DictionaryObject, PdfObject # noqa: PLC0415 if self.indirect_reference is None: @@ -646,6 +645,7 @@ def _get_image( raise KeyError("No inline image can be found") return self.inline_images[id] + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) extension, byte_stream = imgd[:2] return ImageFile( @@ -749,6 +749,7 @@ def _get_inline_images(self) -> dict[str, ImageFile]: if k not in init: init[k] = v ii["object"] = EncodedStreamObject.initialize_from_dictionary(init) + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 extension, byte_stream, img = _xobj_to_image(ii["object"]) files[f"~{num}~"] = ImageFile( name=f"~{num}~{extension}", diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index a9531fab0..cba428109 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -1,13 +1,20 @@ -"""Code in here is only used by pypdf.filters._xobj_to_image""" +"""Functions to convert an image XObject to an image""" import sys from io import BytesIO -from typing import Any, Literal, Union, cast +from typing import Any, Literal, Optional, Union, cast from ._utils import check_if_whitespace_only, logger_warning -from .constants import ColorSpaces -from .constants import FilterTypes as FT -from .constants import ImageAttributes as IA +from .constants import ( + ColorSpaces, + StreamAttributes, +) +from .constants import ( + FilterTypes as FT, +) +from .constants import ( + ImageAttributes as IA, +) from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, @@ -23,9 +30,8 @@ else: from typing_extensions import TypeAlias - try: - from PIL import Image, UnidentifiedImageError # noqa: F401 + from PIL import Image, UnidentifiedImageError except ImportError: raise ImportError( "pillow is required to do image extraction. " @@ -187,7 +193,7 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension, color inversion """ - extension = ".png" # mime_type = "image/png" + extension = ".png" # mime_type: "image/png" image_format = "PNG" lookup: Any base: Any @@ -297,7 +303,7 @@ def _handle_jpx( Process image encoded in flateEncode Returns img, image_format, extension, inversion """ - extension = ".jp2" # mime_type = "image/x-jp2" + extension = ".jp2" # mime_type: "image/x-jp2" img1 = Image.open(BytesIO(data), formats=("JPEG2000",)) mode, invert_color = _get_imagemode(color_space, colors, mode) if mode == "": @@ -315,7 +321,7 @@ def _handle_jpx( img = Image.frombytes(mode, img1.size, img1.tobytes()) else: # pragma: no cover img = img1.convert(mode) - # for CMYK conversion : + # CMYK conversion: # https://stcom/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop # not implemented for the moment as I need to get properly the ICC if img.mode == "CMYK": @@ -392,3 +398,167 @@ def _get_mode_and_invert_color( "", ) return mode, invert_color + + +def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]: + """ + Users need to have the pillow package installed. + + Args: + x_object: + + Returns: + Tuple[file extension, bytes, PIL.Image.Image] + + """ + def _apply_alpha( + img: Image.Image, + x_object: dict[str, Any], + obj_as_text: str, + image_format: str, + extension: str, + ) -> tuple[Image.Image, str, str]: + alpha = None + if IA.S_MASK in x_object: # add alpha channel + alpha = _xobj_to_image(x_object[IA.S_MASK])[2] + if img.size != alpha.size: + logger_warning( + f"image and mask size not matching: {obj_as_text}", __name__ + ) + else: + # TODO: implement mask + if alpha.mode != "L": + alpha = alpha.convert("L") + if img.mode == "P": + img = img.convert("RGB") + elif img.mode == "1": + img = img.convert("L") + img.putalpha(alpha) + if "JPEG" in image_format: + image_format = "JPEG2000" + extension = ".jp2" + else: + image_format = "PNG" + extension = ".png" + return img, extension, image_format + + # For error reporting + obj_as_text = ( + x_object.indirect_reference.__repr__() + if x_object is None # pragma: no cover + else x_object.__repr__() + ) + + # Get size and data + size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT])) + data = x_object.get_data() # type: ignore + if isinstance(data, str): # pragma: no cover + data = data.encode() + if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. '\n' + data = data[:-1] + + # Get color properties + colors = x_object.get("/Colors", 1) + color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object() + if isinstance(color_space, list) and len(color_space) == 1: + color_space = color_space[0].get_object() + + mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space) + + # Get filters + filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object() + lfilters = filters[-1] if isinstance(filters, list) else filters + decode_parms = x_object.get(StreamAttributes.DECODE_PARMS, None) + if decode_parms and isinstance(decode_parms, (tuple, list)): + decode_parms = decode_parms[0] + else: + decode_parms = {} + if not isinstance(decode_parms, dict): + decode_parms = {} + + extension = None + if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): + img, image_format, extension, _ = _handle_flate( + size, + data, + mode, + color_space, + colors, + obj_as_text, + ) + elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE): + # I'm not sure if the following logic is correct. + # There might not be any relationship between the filters and the + # extension + if lfilters == FT.LZW_DECODE: + image_format = "TIFF" + extension = ".tiff" # mime_type: "image/tiff" + else: + image_format = "PNG" + extension = ".png" # mime_type: "image/png" + try: + img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + except UnidentifiedImageError: + img = _extended_image_frombytes(mode, size, data) + elif lfilters == FT.DCT_DECODE: + img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" + # invert_color kept unchanged + elif lfilters == FT.JPX_DECODE: + img, image_format, extension, invert_color = _handle_jpx( + size, data, mode, color_space, colors + ) + elif lfilters == FT.CCITT_FAX_DECODE: + img, image_format, extension, invert_color = ( + Image.open(BytesIO(data), formats=("TIFF",)), + "TIFF", + ".tiff", + False, + ) + elif lfilters == FT.JBIG2_DECODE: + img, image_format, extension, invert_color = ( + Image.open(BytesIO(data), formats=("PNG",)), + "PNG", + ".png", + False, + ) + elif mode == "CMYK": + img, image_format, extension, invert_color = ( + _extended_image_frombytes(mode, size, data), + "TIFF", + ".tif", + False, + ) + elif mode == "": + raise PdfReadError(f"ColorSpace field not found in {x_object}") + else: + img, image_format, extension, invert_color = ( + _extended_image_frombytes(mode, size, data), + "PNG", + ".png", + False, + ) + + img = _apply_decode(img, x_object, lfilters, color_space, invert_color) + img, extension, image_format = _apply_alpha( + img, x_object, obj_as_text, image_format, extension + ) + + # Save image to bytes + img_byte_arr = BytesIO() + try: + img.save(img_byte_arr, format=image_format) + except OSError: # pragma: no cover # covered with pillow 10.3 + # in case of we convert to RGBA and then to PNG + img1 = img.convert("RGBA") + image_format = "PNG" + extension = ".png" + img_byte_arr = BytesIO() + img1.save(img_byte_arr, format=image_format) + data = img_byte_arr.getvalue() + + try: # temporary try/except until other fixes of images + img = Image.open(BytesIO(data)) + except Exception as exception: + logger_warning(f"Failed loading image: {exception}", __name__) + img = None # type: ignore + return extension, data, img diff --git a/pypdf/filters.py b/pypdf/filters.py index fc0486555..b4cc89152 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,6 @@ import zlib from base64 import a85decode from dataclasses import dataclass -from io import BytesIO from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional, Union, cast @@ -788,180 +787,3 @@ def decode_stream_data(stream: Any) -> bytes: else: raise NotImplementedError(f"Unsupported filter {filter_name}") return data - - -def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]: - """ - Users need to have the pillow package installed. - - It's unclear if pypdf will keep this function here, hence it's private. - It might get removed at any point. - - Args: - x_object: - - Returns: - Tuple[file extension, bytes, PIL.Image.Image] - - """ - from ._xobj_image_helpers import ( # noqa: PLC0415 - Image, - UnidentifiedImageError, - _apply_decode, - _extended_image_frombytes, - _get_mode_and_invert_color, - _handle_flate, - _handle_jpx, - ) - - def _apply_alpha( - img: Image.Image, - x_object: dict[str, Any], - obj_as_text: str, - image_format: str, - extension: str, - ) -> tuple[Image.Image, str, str]: - alpha = None - if IA.S_MASK in x_object: # add alpha channel - alpha = _xobj_to_image(x_object[IA.S_MASK])[2] - if img.size != alpha.size: - logger_warning( - f"image and mask size not matching: {obj_as_text}", __name__ - ) - else: - # TODO: implement mask - if alpha.mode != "L": - alpha = alpha.convert("L") - if img.mode == "P": - img = img.convert("RGB") - elif img.mode == "1": - img = img.convert("L") - img.putalpha(alpha) - if "JPEG" in image_format: - image_format = "JPEG2000" - extension = ".jp2" - else: - image_format = "PNG" - extension = ".png" - return img, extension, image_format - - # For error reporting - obj_as_text = ( - x_object.indirect_reference.__repr__() - if x_object is None # pragma: no cover - else x_object.__repr__() - ) - - # Get size and data - size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT])) - data = x_object.get_data() # type: ignore - if isinstance(data, str): # pragma: no cover - data = data.encode() - if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. '\n' - data = data[:-1] - - # Get color properties - colors = x_object.get("/Colors", 1) - color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object() - if isinstance(color_space, list) and len(color_space) == 1: - color_space = color_space[0].get_object() - - mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space) - - # Get filters - filters = x_object.get(SA.FILTER, NullObject()).get_object() - lfilters = filters[-1] if isinstance(filters, list) else filters - decode_parms = x_object.get(SA.DECODE_PARMS, None) - if decode_parms and isinstance(decode_parms, (tuple, list)): - decode_parms = decode_parms[0] - else: - decode_parms = {} - if not isinstance(decode_parms, dict): - decode_parms = {} - - extension = None - if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): - img, image_format, extension, _ = _handle_flate( - size, - data, - mode, - color_space, - colors, - obj_as_text, - ) - elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE): - # I'm not sure if the following logic is correct. - # There might not be any relationship between the filters and the - # extension - if lfilters == FT.LZW_DECODE: - image_format = "TIFF" - extension = ".tiff" # mime_type = "image/tiff" - else: - image_format = "PNG" - extension = ".png" # mime_type = "image/png" - try: - img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) - except UnidentifiedImageError: - img = _extended_image_frombytes(mode, size, data) - elif lfilters == FT.DCT_DECODE: - img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" - # invert_color kept unchanged - elif lfilters == FT.JPX_DECODE: - img, image_format, extension, invert_color = _handle_jpx( - size, data, mode, color_space, colors - ) - elif lfilters == FT.CCITT_FAX_DECODE: - img, image_format, extension, invert_color = ( - Image.open(BytesIO(data), formats=("TIFF",)), - "TIFF", - ".tiff", - False, - ) - elif lfilters == FT.JBIG2_DECODE: - img, image_format, extension, invert_color = ( - Image.open(BytesIO(data), formats=("PNG",)), - "PNG", - ".png", - False, - ) - elif mode == "CMYK": - img, image_format, extension, invert_color = ( - _extended_image_frombytes(mode, size, data), - "TIFF", - ".tif", - False, - ) - elif mode == "": - raise PdfReadError(f"ColorSpace field not found in {x_object}") - else: - img, image_format, extension, invert_color = ( - _extended_image_frombytes(mode, size, data), - "PNG", - ".png", - False, - ) - - img = _apply_decode(img, x_object, lfilters, color_space, invert_color) - img, extension, image_format = _apply_alpha( - img, x_object, obj_as_text, image_format, extension - ) - - # Save image to bytes - img_byte_arr = BytesIO() - try: - img.save(img_byte_arr, format=image_format) - except OSError: # pragma: no cover # covered with pillow 10.3 - # in case of we convert to RGBA and then to PNG - img1 = img.convert("RGBA") - image_format = "PNG" - extension = ".png" - img_byte_arr = BytesIO() - img1.save(img_byte_arr, format=image_format) - data = img_byte_arr.getvalue() - - try: # temporary try/except until other fixes of images - img = Image.open(BytesIO(data)) - except Exception as exception: - logger_warning(f"Failed loading image: {exception}", __name__) - img = None # type: ignore - return extension, data, img diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 6e411ee7c..673a1b98b 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1058,7 +1058,7 @@ def decode_as_image(self) -> Any: stops in your program. """ - from ..filters import _xobj_to_image # noqa: PLC0415 + from .._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 if self.get("/Subtype", "") != "/Image": try: