From 4e48e893c3eca97d54c3e7bafdee212cc4c5d4b0 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 10 Sep 2023 10:03:13 +0200 Subject: [PATCH] STY: Move functions within _xobj_to_image to a private module (#2182) This PR aims to improve reading complexity of the code by moving inner functions of `_xobj_to_image` into their own module. It would have been desirable to also move `_xobj_to_image` into that module. However, the fact that `PIL` is optional increases complexity again. For this reason `_xobj_to_image` stays where it is for the moment. --- pypdf/_xobj_image_helpers.py | 251 +++++++++++++++++++++++++++++++++++ pypdf/filters.py | 251 ++--------------------------------- 2 files changed, 259 insertions(+), 243 deletions(-) create mode 100644 pypdf/_xobj_image_helpers.py diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py new file mode 100644 index 000000000..0165004aa --- /dev/null +++ b/pypdf/_xobj_image_helpers.py @@ -0,0 +1,251 @@ +"""Code in here is only used by pypdf.filters._xobj_to_image""" + +from io import BytesIO +from typing import Any, List, Tuple, Union, cast + +from ._utils import logger_warning +from .constants import ColorSpaces +from .errors import PdfReadError +from .generic import ( + ArrayObject, + DecodedStreamObject, + EncodedStreamObject, + IndirectObject, + NullObject, +) + +try: + from typing import Literal, TypeAlias # type: ignore[attr-defined] +except ImportError: + # PEP 586 introduced typing.Literal with Python 3.8 + # For older Python versions, the backport typing_extensions is necessary: + from typing_extensions import Literal, TypeAlias # type: ignore[misc, assignment] + + +try: + from PIL import Image +except ImportError: + raise ImportError( + "pillow is required to do image extraction. 
" + "It can be installed via 'pip install pypdf[image]'" + ) + +mode_str_type: TypeAlias = Literal[ + "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" +] + + +def _get_imagemode( + color_space: Union[str, List[Any], Any], + color_components: int, + prev_mode: mode_str_type, +) -> Tuple[mode_str_type, bool]: + """ + Returns + Image mode not taking into account mask(transparency) + ColorInversion is required (like for some DeviceCMYK) + """ + if isinstance(color_space, NullObject): + return "", False + if isinstance(color_space, str): + pass + elif not isinstance(color_space, list): + raise PdfReadError( + "can not interprete colorspace", color_space + ) # pragma: no cover + elif color_space[0].startswith("/Cal"): # /CalRGB and /CalGray + color_space = "/Device" + color_space[0][4:] + elif color_space[0] == "/ICCBased": + icc_profile = color_space[1].get_object() + color_components = cast(int, icc_profile["/N"]) + color_space = icc_profile.get("/Alternate", "") + elif color_space[0] == "/Indexed": + color_space = color_space[1] + if isinstance(color_space, IndirectObject): + color_space = color_space.get_object() + mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode) + if mode2 in ("RGB", "CMYK"): + mode2 = "P" + return mode2, invert_color + elif color_space[0] == "/Separation": + color_space = color_space[2] + if isinstance(color_space, IndirectObject): + color_space = color_space.get_object() + mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode) + return mode2, True + elif color_space[0] == "/DeviceN": + color_components = len(color_space[1]) + color_space = color_space[2] + if isinstance(color_space, IndirectObject): # pragma: no cover + color_space = color_space.get_object() + + mode_map = { + "1bit": "1", # pos [0] will be used for 1 bit + "/DeviceGray": "L", # must be in pos [1] + "palette": "P", # must be in pos [2] for color_components align. 
+ "/DeviceRGB": "RGB", # must be in pos [3] + "/DeviceCMYK": "CMYK", # must be in pos [4] + "2bit": "2bits", # 2 bits images + "4bit": "4bits", # 4 bits + } + mode: mode_str_type = ( + mode_map.get(color_space) # type: ignore + or list(mode_map.values())[color_components] + or prev_mode + ) # type: ignore + return mode, mode == "CMYK" + + +def _handle_flate( + size: Tuple[int, int], + data: bytes, + mode: mode_str_type, + color_space: str, + colors: int, + obj_as_text: str, +) -> Tuple[Image.Image, str, str, bool]: + """ + Process image encoded in flateEncode + Returns img, image_format, extension, color inversion + """ + + def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: + mask = (2 << bits) - 1 + nbuff = bytearray(size[0] * size[1]) + by = 0 + bit = 8 - bits + for y in range(size[1]): + if (bit != 0) and (bit != 8 - bits): + by += 1 + bit = 8 - bits + for x in range(size[0]): + nbuff[y * size[0] + x] = (data[by] >> bit) & mask + bit -= bits + if bit < 0: + by += 1 + bit = 8 - bits + return bytes(nbuff) + + extension = ".png" # mime_type = "image/png" + image_format = "PNG" + lookup: Any + base: Any + hival: Any + if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed": + color_space, base, hival, lookup = (value.get_object() for value in color_space) + if mode == "2bits": + mode = "P" + data = bits2byte(data, size, 2) + elif mode == "4bits": + mode = "P" + data = bits2byte(data, size, 4) + img = Image.frombytes(mode, size, data) + if color_space == "/Indexed": + from .generic import TextStringObject + + if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)): + lookup = lookup.get_data() + if isinstance(lookup, TextStringObject): + lookup = lookup.original_bytes + if isinstance(lookup, str): + lookup = lookup.encode() + try: + nb, conv, mode = { # type: ignore + "1": (0, "", ""), + "L": (1, "P", "L"), + "P": (0, "", ""), + "RGB": (3, "P", "RGB"), + "CMYK": (4, "P", "CMYK"), + }[_get_imagemode(base, 0, "")[0]] + 
except KeyError: # pragma: no cover + logger_warning( + f"Base {base} not coded please share the pdf file with pypdf dev team", + __name__, + ) + lookup = None + else: + if img.mode == "1": + colors_arr = [lookup[x - nb : x] for x in range(nb, len(lookup), nb)] + arr = b"".join( + [ + b"".join( + [ + colors_arr[1 if img.getpixel((x, y)) > 127 else 0] + for x in range(img.size[0]) + ] + ) + for y in range(img.size[1]) + ] + ) + img = Image.frombytes(mode, img.size, arr) + else: + img = img.convert(conv) + if len(lookup) != (hival + 1) * nb: + logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__) + lookup = None + elif mode == "L": + # gray lookup does not work : it is converted to a similar RGB lookup + lookup = b"".join([bytes([b, b, b]) for b in lookup]) + mode = "RGB" + # TODO : cf https://github.com/py-pdf/pypdf/pull/2039 + # this is a work around until PIL is able to process CMYK images + elif mode == "CMYK": + _rgb = [] + for _c, _m, _y, _k in ( + lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4) + ): + _r = int(255 * (1 - _c / 255) * (1 - _k / 255)) + _g = int(255 * (1 - _m / 255) * (1 - _k / 255)) + _b = int(255 * (1 - _y / 255) * (1 - _k / 255)) + _rgb.append(bytes((_r, _g, _b))) + lookup = b"".join(_rgb) + mode = "RGB" + if lookup is not None: + img.putpalette(lookup, rawmode=mode) + img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") + elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased": + # see Table 66 - Additional Entries Specific to an ICC Profile + # Stream Dictionary + mode2 = _get_imagemode(color_space, colors, mode)[0] + if mode != mode2: + img = Image.frombytes(mode2, size, data) # reloaded as mode may have change + if mode == "CMYK": + extension = ".tif" + image_format = "TIFF" + return img, image_format, extension, False + + +def _handle_jpx( + size: Tuple[int, int], + data: bytes, + mode: mode_str_type, + color_space: str, + colors: int, +) -> Tuple[Image.Image, str, str, 
bool]: + """ + Process image encoded in JPXDecode (JPEG 2000) + Returns img, image_format, extension, inversion + """ + extension = ".jp2" # mime_type = "image/x-jp2" + img1 = Image.open(BytesIO(data), formats=("JPEG2000",)) + mode, invert_color = _get_imagemode(color_space, colors, mode) + if mode == "": + mode = cast(mode_str_type, img1.mode) + invert_color = mode in ("CMYK",) + if img1.mode == "RGBA" and mode == "RGB": + mode = "RGBA" + # we need to convert to the good mode + try: + if img1.mode != mode: + img = Image.frombytes(mode, img1.size, img1.tobytes()) + else: + img = img1 + except OSError: + img = Image.frombytes(mode, img1.size, img1.tobytes()) + # for CMYK conversion : + # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop + # not implemented for the moment as I need to get properly the ICC + if img.mode == "CMYK": + img = img.convert("RGB") + image_format = "JPEG2000" + return img, image_format, extension, invert_color diff --git a/pypdf/filters.py b/pypdf/filters.py index c990c0819..f308a9010 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -56,20 +56,11 @@ from .errors import PdfReadError, PdfStreamError from .generic import ( ArrayObject, - DecodedStreamObject, DictionaryObject, - EncodedStreamObject, IndirectObject, NullObject, ) -try: - from typing import Literal, TypeAlias # type: ignore[attr-defined] -except ImportError: - # PEP 586 introduced typing.Literal with Python 3.8 - # For older Python versions, the backport typing_extensions is necessary: - from typing_extensions import Literal, TypeAlias # type: ignore[misc, assignment] - def decompress(data: bytes) -> bytes: """ @@ -726,72 +717,6 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated return decode_stream_data(stream) -mode_str_type: TypeAlias = Literal[ - "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" -] - - -def _get_imagemode( - color_space: Union[str, List[Any], Any], - color_components: int, -
prev_mode: mode_str_type, -) -> Tuple[mode_str_type, bool]: - """ - Returns - Image mode not taking into account mask(transparency) - ColorInversion is required (like for some DeviceCMYK) - """ - if isinstance(color_space, NullObject): - return "", False - if isinstance(color_space, str): - pass - elif not isinstance(color_space, list): - raise PdfReadError( - "can not interprete colorspace", color_space - ) # pragma: no cover - elif color_space[0].startswith("/Cal"): # /CalRGB and /CalGray - color_space = "/Device" + color_space[0][4:] - elif color_space[0] == "/ICCBased": - icc_profile = color_space[1].get_object() - color_components = cast(int, icc_profile["/N"]) - color_space = icc_profile.get("/Alternate", "") - elif color_space[0] == "/Indexed": - color_space = color_space[1] - if isinstance(color_space, IndirectObject): - color_space = color_space.get_object() - mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode) - if mode2 in ("RGB", "CMYK"): - mode2 = "P" - return mode2, invert_color - elif color_space[0] == "/Separation": - color_space = color_space[2] - if isinstance(color_space, IndirectObject): - color_space = color_space.get_object() - mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode) - return mode2, True - elif color_space[0] == "/DeviceN": - color_components = len(color_space[1]) - color_space = color_space[2] - if isinstance(color_space, IndirectObject): # pragma: no cover - color_space = color_space.get_object() - - mode_map = { - "1bit": "1", # pos [0] will be used for 1 bit - "/DeviceGray": "L", # must be in pos [1] - "palette": "P", # must be in pos [2] for color_components align. 
- "/DeviceRGB": "RGB", # must be in pos [3] - "/DeviceCMYK": "CMYK", # must be in pos [4] - "2bit": "2bits", # 2 bits images - "4bit": "4bits", # 4 bits - } - mode: mode_str_type = ( - mode_map.get(color_space) # type: ignore - or list(mode_map.values())[color_components] - or prev_mode - ) # type: ignore - return mode, mode == "CMYK" - - def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]: """ Users need to have the pillow package installed. @@ -805,174 +730,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Returns: Tuple[file extension, bytes, PIL.Image.Image] """ - try: - from PIL import Image - except ImportError: - raise ImportError( - "pillow is required to do image extraction. " - "It can be installed via 'pip install pypdf[image]'" - ) - - def _handle_flate( - size: Tuple[int, int], - data: bytes, - mode: mode_str_type, - color_space: str, - colors: int, - ) -> Tuple[Image.Image, str, str, bool]: - """ - Process image encoded in flateEncode - Returns img, image_format, extension, color inversion - """ - - def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: - mask = (2 << bits) - 1 - nbuff = bytearray(size[0] * size[1]) - by = 0 - bit = 8 - bits - for y in range(size[1]): - if (bit != 0) and (bit != 8 - bits): - by += 1 - bit = 8 - bits - for x in range(size[0]): - nbuff[y * size[0] + x] = (data[by] >> bit) & mask - bit -= bits - if bit < 0: - by += 1 - bit = 8 - bits - return bytes(nbuff) - - extension = ".png" # mime_type = "image/png" - image_format = "PNG" - lookup: Any - base: Any - hival: Any - if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed": - color_space, base, hival, lookup = ( - value.get_object() for value in color_space - ) - if mode == "2bits": - mode = "P" - data = bits2byte(data, size, 2) - elif mode == "4bits": - mode = "P" - data = bits2byte(data, size, 4) - img = Image.frombytes(mode, size, data) - if color_space == "/Indexed": - 
from .generic import TextStringObject - - if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)): - lookup = lookup.get_data() - if isinstance(lookup, TextStringObject): - lookup = lookup.original_bytes - if isinstance(lookup, str): - lookup = lookup.encode() - try: - nb, conv, mode = { # type: ignore - "1": (0, "", ""), - "L": (1, "P", "L"), - "P": (0, "", ""), - "RGB": (3, "P", "RGB"), - "CMYK": (4, "P", "CMYK"), - }[_get_imagemode(base, 0, "")[0]] - except KeyError: # pragma: no cover - logger_warning( - f"Base {base} not coded please share the pdf file with pypdf dev team", - __name__, - ) - lookup = None - else: - if img.mode == "1": - colors_arr = [ - lookup[x - nb : x] for x in range(nb, len(lookup), nb) - ] - arr = b"".join( - [ - b"".join( - [ - colors_arr[1 if img.getpixel((x, y)) > 127 else 0] - for x in range(img.size[0]) - ] - ) - for y in range(img.size[1]) - ] - ) - img = Image.frombytes(mode, img.size, arr) - else: - img = img.convert(conv) - if len(lookup) != (hival + 1) * nb: - logger_warning( - f"Invalid Lookup Table in {obj_as_text}", __name__ - ) - lookup = None - elif mode == "L": - # gray lookup does not work : it is converted to a similar RGB lookup - lookup = b"".join([bytes([b, b, b]) for b in lookup]) - mode = "RGB" - # TODO : cf https://github.com/py-pdf/pypdf/pull/2039 - # this is a work around until PIL is able to process CMYK images - elif mode == "CMYK": - _rgb = [] - for _c, _m, _y, _k in ( - lookup[n : n + 4] - for n in range(0, 4 * (len(lookup) // 4), 4) - ): - _r = int(255 * (1 - _c / 255) * (1 - _k / 255)) - _g = int(255 * (1 - _m / 255) * (1 - _k / 255)) - _b = int(255 * (1 - _y / 255) * (1 - _k / 255)) - _rgb.append(bytes((_r, _g, _b))) - lookup = b"".join(_rgb) - mode = "RGB" - if lookup is not None: - img.putpalette(lookup, rawmode=mode) - img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") - elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased": - # see Table 66 - 
Additional Entries Specific to an ICC Profile - # Stream Dictionary - mode2 = _get_imagemode(color_space, colors, mode)[0] - if mode != mode2: - img = Image.frombytes( - mode2, size, data - ) # reloaded as mode may have change - if mode == "CMYK": - extension = ".tif" - image_format = "TIFF" - return img, image_format, extension, False - - def _handle_jpx( - size: Tuple[int, int], - data: bytes, - mode: mode_str_type, - color_space: str, - colors: int, - ) -> Tuple[Image.Image, str, str, bool]: - """ - Process image encoded in flateEncode - Returns img, image_format, extension, inversion - """ - extension = ".jp2" # mime_type = "image/x-jp2" - img1 = Image.open(BytesIO(data), formats=("JPEG2000",)) - mode, invert_color = _get_imagemode(color_space, colors, mode) - if mode == "": - mode = cast(mode_str_type, img1.mode) - invert_color = mode in ("CMYK",) - if img1.mode == "RGBA" and mode == "RGB": - mode = "RGBA" - # we need to convert to the good mode - try: - if img1.mode != mode: - img = Image.frombytes(mode, img1.size, img1.tobytes()) - else: - img = img1 - except OSError: - img = Image.frombytes(mode, img1.size, img1.tobytes()) - # for CMYK conversion : - # https://stcom/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop - # not implemented for the moment as I need to get properly the ICC - if img.mode == "CMYK": - img = img.convert("RGB") - image_format = "JPEG2000" - return img, image_format, extension, invert_color + from ._xobj_image_helpers import ( + Image, + _get_imagemode, + _handle_flate, + _handle_jpx, + mode_str_type, + ) # for error reporting if ( @@ -1025,6 +789,7 @@ def _handle_jpx( mode, color_space, colors, + obj_as_text, ) elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE): # I'm not sure if the following logic is correct.