Skip to content

Commit

Permalink
Move to own module
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Sep 9, 2023
1 parent ab9ff1e commit 63c5c43
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 180 deletions.
4 changes: 2 additions & 2 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@
logger_warning,
matrix_multiply,
)
from ._xobj_to_image import _xobj_to_image
from .constants import AnnotationDictionaryAttributes as ADA
from .constants import ImageAttributes as IA
from .constants import PageAttributes as PG
from .constants import Ressources as RES
from .errors import PageSizeNotDefinedError, PdfReadError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -1091,7 +1091,7 @@ def _merge_page(
annots = page[PG.ANNOTS]
if isinstance(annots, ArrayObject):
for ref in annots:
new_annots.append(ref) # noqa: PERF402
new_annots.append(ref) # noqa

for res in (
RES.EXT_G_STATE,
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ def replace(self, new_image: Any, **kwargs: Any) -> None:
from ._reader import PdfReader

# to prevent circular import
from .filters import _xobj_to_image
from ._xobj_to_image import _xobj_to_image
from .generic import DictionaryObject, PdfObject

if self.indirect_reference is None:
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2954,7 +2954,7 @@ def merge(
excluded_fields = ()
# Find the range of pages to merge.
if pages is None:
pages = list(range(0, len(reader.pages)))
pages = list(range(len(reader.pages)))
elif isinstance(pages, PageRange):
pages = list(range(*pages.indices(len(reader.pages))))
elif isinstance(pages, list):
Expand Down
168 changes: 167 additions & 1 deletion pypdf/_xobj_image_helpers.py → pypdf/_xobj_to_image.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""Functions to convert an image XObject to an image, plus the helpers they use."""

from io import BytesIO
from typing import Any, List, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._utils import logger_warning
from .constants import ColorSpaces
from .constants import FilterTypes as FT
from .constants import ImageAttributes as IA
from .constants import StreamAttributes as SA
from .errors import PdfReadError
from .generic import (
ArrayObject,
Expand Down Expand Up @@ -35,6 +38,169 @@
]


def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
    """
    Convert an image XObject into encoded image bytes and a PIL image.

    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object_obj: The image XObject. Besides dictionary-style access it
            must provide ``get_data()``. The width and height entries are
            read unconditionally; if a soft mask entry is present it is
            converted recursively and applied as alpha channel.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]. The image is obtained
        by re-opening the encoded bytes and is ``None`` if that fails.

    Raises:
        PdfReadError: If the last filter has no dedicated handling and no
            image mode could be determined.
    """
    # For error reporting: prefer the compact indirect reference when the
    # object carries one, otherwise fall back to the repr of the object.
    # (The previous condition tested ``x_object_obj is None`` and therefore
    # could never select the indirect reference.)
    indirect_reference = getattr(x_object_obj, "indirect_reference", None)
    if indirect_reference is not None:
        obj_as_text = repr(indirect_reference)
    else:
        obj_as_text = repr(x_object_obj)

    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
    data = x_object_obj.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    colors = x_object_obj.get("/Colors", 1)
    color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
    # A one-element list is unwrapped to the colour space it contains.
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        # NOTE(review): this value appears to be reassigned by both branches
        # below; it is kept to preserve the original statement order - confirm.
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        mode, invert_color = _get_imagemode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        mode, invert_color = _get_imagemode(
            color_space,
            2
            if (
                colors == 1
                and (
                    not isinstance(color_space, NullObject)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    extension = None
    alpha = None
    filters = x_object_obj.get(SA.FILTER, [None])
    # Only the last filter of a filter list decides which branch is taken.
    lfilters = filters[-1] if isinstance(filters, list) else filters
    if lfilters == FT.FLATE_DECODE:
        img, image_format, extension, invert_color = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
            extension = ".tiff"  # mime_type = "image/tiff"
            image_format = "TIFF"
        else:
            extension = ".png"  # mime_type = "image/png"
            image_format = "PNG"
        img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        # invert_color kept unchanged
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    # NOTE: a separate ``FT.CCITT_FAX_DECODE`` branch used to follow here. It
    # could never run because that filter is already matched above, so it was
    # removed; the handling of CCITT images is unchanged.
    else:
        if mode == "":
            raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
        img, image_format, extension, invert_color = (
            Image.frombytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    # CMYK image and other colorspaces without decode
    # requires reverting scale (cf p243,2§ last sentence)
    decode = x_object_obj.get(
        IA.DECODE,
        ([1.0, 0.0] * len(img.getbands()))
        if (
            (img.mode == "CMYK" or (invert_color and img.mode == "L"))
            and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
        )
        else None,
    )
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Indexed"
    ):
        decode = None  # decode is meaningless for Indexed color spaces
    # Skip the remapping when the decode array is the identity [0, 1, 0, 1, ...].
    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
        # One 256-entry lookup table per (dmin, dmax) pair, concatenated.
        lut: List[int] = []
        for i in range(0, len(decode), 2):
            dmin = decode[i]
            dmax = decode[i + 1]
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)

    if IA.S_MASK in x_object_obj:  # add alpha channel
        alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
        if img.size != alpha.size:
            logger_warning(f"image and mask size not matching: {obj_as_text}", __name__)
        else:
            # TODO : implement mask
            if alpha.mode != "L":
                alpha = alpha.convert("L")
            if img.mode == "P":
                img = img.convert("RGB")
            elif img.mode == "1":
                img = img.convert("L")
            img.putalpha(alpha)
        # Whenever a soft mask entry exists, the output format is switched
        # to JPEG2000 (for JPEG based formats) or PNG.
        if "JPEG" in image_format:
            extension = ".jp2"
            image_format = "JPEG2000"
        else:
            extension = ".png"
            image_format = "PNG"

    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format)
    except OSError:  # pragma: no cover
        # odd error: retry once with a fresh buffer
        img_byte_arr = BytesIO()
        img.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    # Deliberately best-effort: the encoded bytes are still returned when
    # they cannot be re-opened as an image.
    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception:
        img = None  # type: ignore
    return extension, data, img


def _get_imagemode(
color_space: Union[str, List[Any], Any],
color_components: int,
Expand Down
Loading

0 comments on commit 63c5c43

Please sign in to comment.