Move to own module

py-pdf · Sep 9, 2023 · 63c5c43 · 63c5c43
1 parent ab9ff1e
commit 63c5c43
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 180 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -66,12 +66,12 @@
  logger_warning,
  matrix_multiply,
 )
+from ._xobj_to_image import _xobj_to_image
 from .constants import AnnotationDictionaryAttributes as ADA
 from .constants import ImageAttributes as IA
 from .constants import PageAttributes as PG
 from .constants import Ressources as RES
 from .errors import PageSizeNotDefinedError, PdfReadError
-from .filters import _xobj_to_image
 from .generic import (
  ArrayObject,
  ContentStream,
@@ -1091,7 +1091,7 @@ def _merge_page(
  annots = page[PG.ANNOTS]
  if isinstance(annots, ArrayObject):
  for ref in annots:
- new_annots.append(ref) # noqa: PERF402
+ new_annots.append(ref) # noqa
 
  for res in (
  RES.EXT_G_STATE,

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -563,7 +563,7 @@ def replace(self, new_image: Any, **kwargs: Any) -> None:
  from ._reader import PdfReader
 
  # to prevent circular import
- from .filters import _xobj_to_image
+ from ._xobj_to_image import _xobj_to_image
  from .generic import DictionaryObject, PdfObject
 
  if self.indirect_reference is None:

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -2954,7 +2954,7 @@ def merge(
  excluded_fields = ()
  # Find the range of pages to merge.
  if pages is None:
- pages = list(range(0, len(reader.pages)))
+ pages = list(range(len(reader.pages)))
  elif isinstance(pages, PageRange):
  pages = list(range(*pages.indices(len(reader.pages))))
  elif isinstance(pages, list):

diff --git a/pypdf/_xobj_image_helpers.py → pypdf/_xobj_to_image.py b/pypdf/_xobj_image_helpers.py → pypdf/_xobj_to_image.py
@@ -1,10 +1,13 @@
 """Convert an image XObject to an image; the helpers in here are only used by _xobj_to_image."""
 
 from io import BytesIO
-from typing import Any, List, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._utils import logger_warning
 from .constants import ColorSpaces
+from .constants import FilterTypes as FT
+from .constants import ImageAttributes as IA
+from .constants import StreamAttributes as SA
 from .errors import PdfReadError
 from .generic import (
  ArrayObject,
@@ -35,6 +38,169 @@
 ]
 
 
def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
    """
    Convert an image XObject into encoded image bytes and a PIL image.

    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object_obj: The image XObject. It is read through ``IA.WIDTH``,
            ``IA.HEIGHT``, ``/Colors``, ``/ColorSpace``, ``/BitsPerComponent``,
            ``SA.FILTER``, ``IA.DECODE`` and ``IA.S_MASK``, and its stream
            content is obtained with ``get_data()``.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]. The image element is
        ``None`` when the freshly encoded bytes cannot be re-opened.

    Raises:
        PdfReadError: If no known filter applies and no image mode could be
            derived from the colour space.
    """
    # For error reporting: prefer the (short) indirect reference when the
    # object carries one, otherwise fall back to the object itself.
    indirect_reference = getattr(x_object_obj, "indirect_reference", None)
    if indirect_reference is not None:
        obj_as_text = repr(indirect_reference)
    else:
        obj_as_text = repr(x_object_obj)

    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
    data = x_object_obj.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    colors = x_object_obj.get("/Colors", 1)
    color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        # NOTE(review): this value is overwritten by the mode lookup just below.
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        mode, invert_color = _get_imagemode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        mode, invert_color = _get_imagemode(
            color_space,
            2
            if (
                colors == 1
                and (
                    not isinstance(color_space, NullObject)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    extension = None
    alpha = None
    filters = x_object_obj.get(SA.FILTER, [None])
    # Only the last filter of a chain decides how the data is interpreted.
    lfilters = filters[-1] if isinstance(filters, list) else filters
    if lfilters == FT.FLATE_DECODE:
        img, image_format, extension, invert_color = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # CCITT is intentionally not listed here: it has its own branch
        # below, which would otherwise be unreachable.
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if x_object_obj[SA.FILTER] == [FT.LZW_DECODE]:
            extension = ".tiff"  # mime_type = "image/tiff"
            image_format = "TIFF"
        else:
            extension = ".png"  # mime_type = "image/png"
            image_format = "PNG"
        img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    else:
        if mode == "":
            raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
        img, image_format, extension, invert_color = (
            Image.frombytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    # CMYK image and other colorspaces without decode
    # requires reverting scale (cf p243,2§ last sentence)
    decode = x_object_obj.get(
        IA.DECODE,
        ([1.0, 0.0] * len(img.getbands()))
        if (
            (img.mode == "CMYK" or (invert_color and img.mode == "L"))
            and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
        )
        else None,
    )
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Indexed"
    ):
        decode = None  # decode is meaningless for Indexed
    # Skip the remapping when decode is the identity [0, 1, 0, 1, ...].
    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
        # One 256-entry lookup table per (dmin, dmax) pair, concatenated.
        lut: List[int] = []
        for i in range(0, len(decode), 2):
            dmin = decode[i]
            dmax = decode[i + 1]
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)

    if IA.S_MASK in x_object_obj:  # add alpha channel
        alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
        if img.size != alpha.size:
            logger_warning(f"image and mask size not matching: {obj_as_text}", __name__)
        else:
            # TODO : implement mask
            if alpha.mode != "L":
                alpha = alpha.convert("L")
            if img.mode == "P":
                img = img.convert("RGB")
            elif img.mode == "1":
                img = img.convert("L")
            img.putalpha(alpha)
        # An image with a soft mask is written as JPEG2000 or PNG.
        if "JPEG" in image_format:
            extension = ".jp2"
            image_format = "JPEG2000"
        else:
            extension = ".png"
            image_format = "PNG"

    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format)
    except OSError:  # pragma: no cover
        # odd error: retry once with a fresh buffer
        img_byte_arr = BytesIO()
        img.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception:
        img = None  # type: ignore
    return extension, data, img
+
+
 def _get_imagemode(
  color_space: Union[str, List[Any], Any],
  color_components: int,