diff --git a/pypdf/_page.py b/pypdf/_page.py index ea8e6f5a9..50d030250 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. import math -import re import sys from decimal import Decimal from pathlib import Path @@ -58,7 +57,6 @@ mult, ) from ._utils import ( - WHITESPACES_AS_REGEXP, CompressedTransformationMatrix, File, ImageFile, @@ -82,6 +80,7 @@ NameObject, NullObject, NumberObject, + PdfObject, RectangleObject, StreamObject, ) @@ -335,7 +334,6 @@ def __init__( self.pdf = pdf self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] - self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None self.indirect_reference = indirect_reference def hash_value_data(self) -> bytes: @@ -439,19 +437,8 @@ def _get_ids_image( return [] else: call_stack.append(_i) - if self.inline_images_keys is None: - content = self._get_contents_as_bytes() or b"" - nb_inlines = 0 - for matching in re.finditer( - WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP, - content, - ): - start_of_string = content[: matching.start()] - if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len( - re.findall(b"[^\\\\]\\)", start_of_string) - ): - nb_inlines += 1 - self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)] + if self.inline_images is None: + self.inline_images = self._get_inline_images() if obj is None: obj = self if ancest is None: @@ -460,7 +447,7 @@ def _get_ids_image( if PG.RESOURCES not in obj or RES.XOBJECT not in cast( DictionaryObject, obj[PG.RESOURCES] ): - return self.inline_images_keys + return [] if self.inline_images is None else list(self.inline_images.keys()) x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: @@ -470,7 +457,9 @@ def _get_ids_image( lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) - return lst + self.inline_images_keys + assert self.inline_images is not None + lst.extend(list(self.inline_images.keys())) + return lst def _get_image( self, @@ -551,6 +540,46 @@ def images(self) -> List[ImageFile]: """ return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore + def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: + """Translate values used in inline image""" + try: + v = NameObject( + { + "/G": "/DeviceGray", + "/RGB": "/DeviceRGB", + "/CMYK": "/DeviceCMYK", + "/I": "/Indexed", + "/AHx": "/ASCIIHexDecode", + "/A85": "/ASCII85Decode", + "/LZW": "/LZWDecode", + "/Fl": "/FlateDecode", + "/RL": "/RunLengthDecode", + "/CCF": "/CCITTFaxDecode", + "/DCT": "/DCTDecode", + "/DeviceGray": "/DeviceGray", + "/DeviceRGB": "/DeviceRGB", + "/DeviceCMYK": "/DeviceCMYK", + "/Indexed": "/Indexed", + "/ASCIIHexDecode": "/ASCIIHexDecode", + "/ASCII85Decode": "/ASCII85Decode", + "/LZWDecode": "/LZWDecode", + "/FlateDecode": "/FlateDecode", + "/RunLengthDecode": "/RunLengthDecode", + "/CCITTFaxDecode": "/CCITTFaxDecode", + "/DCTDecode": "/DCTDecode", + }[cast(str, v)] + ) + except (TypeError, KeyError): + if isinstance(v, NameObject): + # It is a custom name, thus we have to look in resources. + # The only applicable case is for ColorSpace. + try: + res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] + v = cast(DictionaryObject, res)[v] + except KeyError: # for res and v + raise PdfReadError(f"Cannot find resource entry {v} for {k}") + return v + def _get_inline_images(self) -> Dict[str, ImageFile]: """ get inline_images @@ -593,51 +622,39 @@ def _get_inline_images(self) -> Dict[str, ImageFile]: "/Length": len(ii["__streamdata__"]), } for k, v in ii["settings"].items(): - try: - v = NameObject( - { - "/G": "/DeviceGray", - "/RGB": "/DeviceRGB", - "/CMYK": "/DeviceCMYK", - "/I": "/Indexed", - "/AHx": "/ASCIIHexDecode", - "/A85": "/ASCII85Decode", - "/LZW": "/LZWDecode", - "/Fl": "/FlateDecode", - "/RL": "/RunLengthDecode", - "/CCF": "/CCITTFaxDecode", - "/DCT": "/DCTDecode", - }[v] - ) - except (TypeError, KeyError): - if isinstance(v, NameObject): - # it is a custom name : we have to look in resources : - # the only applicable case is for ColorSpace - try: - res = cast(DictionaryObject, self["/Resources"])[ - "/ColorSpace" - ] - v = cast(DictionaryObject, res)[v] - except KeyError: # for res and v - raise PdfReadError( - f"Can not find resource entry {v} for {k}" - ) - init[ - NameObject( - { - "/BPC": "/BitsPerComponent", - "/CS": "/ColorSpace", - "/D": "/Decode", - "/DP": "/DecodeParms", - "/F": "/Filter", - "/H": "/Height", - "/W": "/Width", - "/I": "/Interpolate", - "/Intent": "/Intent", - "/IM": "/ImageMask", - }[k] + if k in {"/Length", "/L"}: # no length is expected + continue + if isinstance(v, list): + v = ArrayObject( + [self._translate_value_inlineimage(k, x) for x in v] ) - ] = v + else: + v = self._translate_value_inlineimage(k, v) + k = NameObject( + { + "/BPC": "/BitsPerComponent", + "/CS": "/ColorSpace", + "/D": "/Decode", + "/DP": "/DecodeParms", + "/F": "/Filter", + "/H": "/Height", + "/W": "/Width", + "/I": "/Interpolate", + "/Intent": "/Intent", + "/IM": "/ImageMask", + "/BitsPerComponent": "/BitsPerComponent", + "/ColorSpace": "/ColorSpace", + "/Decode": "/Decode", + "/DecodeParms": "/DecodeParms", + "/Filter": "/Filter", + "/Height": "/Height", + "/Width": "/Width", + "/Interpolate": "/Interpolate", + "/ImageMask": "/ImageMask", + }[k] + ) + if k not in init: + init[k] = v ii["object"] = EncodedStreamObject.initialize_from_dictionary(init) extension, byte_stream, img = _xobj_to_image(ii["object"]) files[f"~{num}~"] = ImageFile( @@ -934,6 +951,8 @@ def replace_contents( # as a backup solution, we put content as an object although not in accordance with pdf ref # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content + # forces recalculation of inline_images + self.inline_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 0a5a66847..ed830d1d7 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -386,7 +386,8 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00") -WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]" +WHITESPACES_AS_BYTES = b"".join(WHITESPACES) +WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" def paeth_predictor(left: int, up: int, up_left: int) -> int: diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index cc0123ff2..45b0c145b 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -29,7 +29,7 @@ try: - from PIL import Image + from PIL import Image, UnidentifiedImageError # noqa: F401 except ImportError: raise ImportError( "pillow is required to do image extraction. " @@ -123,6 +123,24 @@ def _get_imagemode( return mode, mode == "CMYK" +def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: + mask = (1 << bits) - 1 + nbuff = bytearray(size[0] * size[1]) + by = 0 + bit = 8 - bits + for y in range(size[1]): + if (bit != 0) and (bit != 8 - bits): + by += 1 + bit = 8 - bits + for x in range(size[0]): + nbuff[y * size[0] + x] = (data[by] >> bit) & mask + bit -= bits + if bit < 0: + by += 1 + bit = 8 - bits + return bytes(nbuff) + + def _extended_image_frombytes( mode: str, size: Tuple[int, int], data: bytes ) -> Image.Image: @@ -150,24 +168,6 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension, color inversion """ - - def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: - mask = (2 << bits) - 1 - nbuff = bytearray(size[0] * size[1]) - by = 0 - bit = 8 - bits - for y in range(size[1]): - if (bit != 0) and (bit != 8 - bits): - by += 1 - bit = 8 - bits - for x in range(size[0]): - nbuff[y * size[0] + x] = (data[by] >> bit) & mask - bit -= bits - if bit < 0: - by += 1 - bit = 8 - bits - return bytes(nbuff) - extension = ".png" # mime_type = "image/png" image_format = "PNG" lookup: Any diff --git a/pypdf/filters.py b/pypdf/filters.py index d573ae30e..069a3d023 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -37,10 +37,12 @@ import math import struct import zlib +from base64 import a85decode from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union, cast from ._utils import ( + WHITESPACES_AS_BYTES, b_, deprecate_with_replacement, deprecation_no_replacement, @@ -467,7 +469,7 @@ def decode( Decode an LZW encoded data stream. Args: - data: bytes`` or ``str`` text to decode. + data: ``bytes`` or ``str`` text to decode. decode_parms: a dictionary of parameter values. Returns: @@ -487,29 +489,20 @@ def decode( decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: - # decode_parms is unused here + """ + Decode an Ascii85 encoded data stream. + + Args: + data: ``bytes`` or ``str`` text to decode. + decode_parms: a dictionary of parameter values. + Returns: + decoded data. + """ if isinstance(data, str): - data = data.encode("ascii") - group_index = b = 0 - out = bytearray() - for char in data: - if ord("!") <= char <= ord("u"): - group_index += 1 - b = b * 85 + (char - 33) - if group_index == 5: - out += struct.pack(b">L", b) - group_index = b = 0 - elif char == ord("z"): - assert group_index == 0 - out += b"\0\0\0\0" - elif char == ord("~"): - if group_index: - for _ in range(5 - group_index): - b = b * 85 + 84 - out += struct.pack(b">L", b)[: group_index - 1] - break - return bytes(out) + data = data.encode() + data = data.strip(WHITESPACES_AS_BYTES) + return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) class DCTDecode: @@ -742,6 +735,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, """ from ._xobj_image_helpers import ( Image, + UnidentifiedImageError, _extended_image_frombytes, _get_imagemode, _handle_flate, @@ -808,13 +802,16 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, # I'm not sure if the following logic is correct. # There might not be any relationship between the filters and the # extension - if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]: + if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE): extension = ".tiff" # mime_type = "image/tiff" image_format = "TIFF" else: extension = ".png" # mime_type = "image/png" image_format = "PNG" - img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + try: + img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + except UnidentifiedImageError: + img = _extended_image_frombytes(mode, size, data) elif lfilters == FT.DCT_DECODE: img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" # invert_color kept unchanged diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 04aebd60e..1688d5d5c 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -33,6 +33,7 @@ import re import sys from io import BytesIO +from math import ceil from typing import ( Any, Callable, @@ -81,6 +82,13 @@ TextStringObject, ) from ._fit import Fit +from ._image_inline import ( + extract_inline_A85, + extract_inline_AHx, + extract_inline_DCT, + extract_inline_default, + extract_inline_RL, +) from ._utils import read_hex_string_from_stream, read_string_from_stream if sys.version_info >= (3, 11): @@ -1152,65 +1160,49 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # left at beginning of ID tmp = stream.read(3) assert tmp[:2] == b"ID" - data = BytesIO() - # Read the inline image, while checking for EI (End Image) operator. - while True: - # Read 8 kB at a time and check if the chunk contains the E operator. - buf = stream.read(8192) - # We have reached the end of the stream, but haven't found the EI operator. - if not buf: - raise PdfReadError("Unexpected end of stream") - loc = buf.find( - b"E" - ) # we can not look straight for "EI" because it may not have been loaded in the buffer - - if loc == -1: - data.write(buf) + filtr = settings.get("/F", settings.get("/Filter", "not set")) + savpos = stream.tell() + if isinstance(filtr, list): + filtr = filtr[0] # used forencoding + if "AHx" in filtr or "ASCIIHexDecode" in filtr: + data = extract_inline_AHx(stream) + elif "A85" in filtr or "ASCII85Decode" in filtr: + data = extract_inline_A85(stream) + elif "RL" in filtr or "RunLengthDecode" in filtr: + data = extract_inline_RL(stream) + elif "DCT" in filtr or "DCTDecode" in filtr: + data = extract_inline_DCT(stream) + elif filtr == "not set": + cs = settings.get("/CS", "") + if "RGB" in cs: + lcs = 3 + elif "CMYK" in cs: + lcs = 4 else: - # Write out everything before the E. - data.write(buf[0:loc]) - - # Seek back in the stream to read the E next. - stream.seek(loc - len(buf), 1) - tok = stream.read(1) # E of "EI" - # Check for End Image - tok2 = stream.read(1) # I of "EI" - if tok2 != b"I": - stream.seek(-1, 1) - data.write(tok) - continue - # for further debug : print("!!!!",buf[loc-1:loc+10]) - info = tok + tok2 - tok3 = stream.read( - 1 - ) # possible space after "EI" may not been loaded in buf - if tok3 not in WHITESPACES: - stream.seek(-2, 1) # to step back on I - data.write(tok) - elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: - # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. - while tok3 in WHITESPACES: - # needed ???? : info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - # we do not insert EI - break - else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: - # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. - while tok3 in WHITESPACES: - info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - if tok3 == b"Q": - break - elif tok3 == b"E": - ope = stream.read(3) - stream.seek(-3, 1) - if ope == b"EMC": - break - else: - data.write(info) - return {"settings": settings, "data": data.getvalue()} + bits = settings.get( + "/BPC", + 8 if cs in {"/I", "/G", "/Indexed", "/DeviceGray"} else -1, + ) + if bits > 0: + lcs = bits / 8.0 + else: + data = extract_inline_default(stream) + lcs = -1 + if lcs > 0: + data = stream.read( + ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) + ) + ei = read_non_whitespace(stream) + stream.seek(-1, 1) + else: + data = extract_inline_default(stream) + + ei = stream.read(3) + stream.seek(-1, 1) + if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES: + stream.seek(savpos, 0) + data = extract_inline_default(stream) + return {"settings": settings, "data": data} # This overrides the parent method: def get_data(self) -> bytes: diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py new file mode 100644 index 000000000..41826ac31 --- /dev/null +++ b/pypdf/generic/_image_inline.py @@ -0,0 +1,235 @@ +# Copyright (c) 2024, pypdf contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import logging +from io import BytesIO + +from .._utils import ( + WHITESPACES, + StreamType, + read_non_whitespace, +) +from ..errors import PdfReadError + +logger = logging.getLogger(__name__) + +BUFFER_SIZE = 8192 + + +def extract_inline_AHx(stream: StreamType) -> bytes: + """ + Extract HexEncoded Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data until delimiter > and EI as backup + # ignoring backup. + while True: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b">") + if pos_tok >= 0: # found > + data_out += data_buffered[: (pos_tok + 1)] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) + break + pos_ei = data_buffered.find(b"EI") + if pos_ei >= 0: # found EI + stream.seek(-len(data_buffered) + pos_ei - 1, 1) + c = stream.read(1) + while c in WHITESPACES: + stream.seek(-2, 1) + c = stream.read(1) + pos_ei -= 1 + data_out += data_buffered[:pos_ei] + break + elif len(data_buffered) == 2: + data_out += data_buffered + raise PdfReadError("Unexpected end of stream") + else: # > nor EI found + data_out += data_buffered[:-2] + stream.seek(-2, 1) + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_A85(stream: StreamType) -> bytes: + """ + Extract A85 Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.2 from PDF ref 1.7 + while True: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b"~>") + if pos_tok >= 0: # found! + data_out += data_buffered[: pos_tok + 2] + stream.seek(-len(data_buffered) + pos_tok + 2, 1) + break + elif len(data_buffered) == 2: # end of buffer + data_out += data_buffered + raise PdfReadError("Unexpected end of stream") + data_out += data_buffered[ + :-2 + ] # back by one char in case of in the middle of ~> + stream.seek(-2, 1) + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_RL(stream: StreamType) -> bytes: + """ + Extract RL Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.4 from PDF ref 1.7 + while True: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b"\x80") + if pos_tok >= 0: # found + data_out += data_buffered[: pos_tok + 1] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) + break + data_out += data_buffered + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_DCT(stream: StreamType) -> bytes: + """ + Extract DCT (JPEG) Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read Blocks of data (ID/Size/data) up to ID=FF/D9 + # see https://www.digicamsoft.com/itu/itu-t81-36.html + notfirst = False + while True: + c = stream.read(1) + if notfirst or (c == b"\xff"): + data_out += c + if c != b"\xff": + continue + else: + notfirst = True + c = stream.read(1) + data_out += c + if c == b"\xff": + stream.seek(-1, 1) # pragma: no cover + elif c == b"\x00": # stuffing + pass + elif c == b"\xd9": # end + break + elif c in ( + b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf" + b"\xda\xdb\xdc\xdd\xde\xdf" + b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" + ): + c = stream.read(2) + data_out += c + sz = c[0] * 256 + c[1] + data_out += stream.read(sz - 2) + # else: pass + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_default(stream: StreamType) -> bytes: + """ + Legacy method + used by default + """ + stream_out = BytesIO() + # Read the inline image, while checking for EI (End Image) operator. + while True: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_ei = data_buffered.find( + b"E" + ) # we can not look straight for "EI" because it may not have been loaded in the buffer + + if pos_ei == -1: + stream_out.write(data_buffered) + else: + # Write out everything including E (the one from EI to be removed). + stream_out.write(data_buffered[0 : pos_ei + 1]) + sav_pos_ei = stream_out.tell() - 1 + # Seek back in the stream to read the E next. + stream.seek(pos_ei + 1 - len(data_buffered), 1) + saved_pos = stream.tell() + # Check for End Image + tok2 = stream.read(1) # I of "EI" + if tok2 != b"I": + stream.seek(saved_pos, 0) + continue + tok3 = stream.read(1) # possible space after "EI" + if tok3 not in WHITESPACES: + stream.seek(saved_pos, 0) + continue + while tok3 in WHITESPACES: + tok3 = stream.read(1) + if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in { + b"Q", + b"E", + }: # for Q ou EMC + stream.seek(saved_pos, 0) + continue + # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients + # remove E(I) wrongly inserted earlier + stream_out.truncate(sav_pos_ei) + break + + return stream_out.getvalue() diff --git a/tests/test_filters.py b/tests/test_filters.py index d3980be0b..146ce43cb 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -147,11 +147,10 @@ def test_decode_ahx(): _ = list(p.images.keys()) -@pytest.mark.xfail() def test_ascii85decode_with_overflow(): inputs = ( v + "~>" - for v in "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" + for v in "\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a" "\x1b\x1c\x1d\x1e\x1fvwxy{|}~\x7f\x80\x81\x82" "\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d" @@ -161,9 +160,8 @@ def test_ascii85decode_with_overflow(): ) for i in inputs: - with pytest.raises(ValueError) as exc: + with pytest.raises(ValueError): ASCII85Decode.decode(i) - assert exc.value.args[0] == "" def test_ascii85decode_five_zero_bytes(): @@ -183,10 +181,10 @@ def test_ascii85decode_five_zero_bytes(): b"\x00\x00\x00\x00" * 3, ) - assert ASCII85Decode.decode("!!!!!") == ASCII85Decode.decode("z") + assert ASCII85Decode.decode("!!!!!~>") == ASCII85Decode.decode("z~>") for expected, i in zip(exp_outputs, inputs): - assert ASCII85Decode.decode(i) == expected + assert ASCII85Decode.decode(i + "~>") == expected def test_ccitparameters(): diff --git a/tests/test_generic.py b/tests/test_generic.py index 24da063a2..f59c559e0 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,5 +1,6 @@ """Test the pypdf.generic module.""" +from base64 import a85encode from copy import deepcopy from io import BytesIO from pathlib import Path @@ -15,6 +16,8 @@ ArrayObject, BooleanObject, ByteStringObject, + ContentStream, + DecodedStreamObject, Destination, DictionaryObject, Fit, @@ -35,6 +38,12 @@ read_object, read_string_from_stream, ) +from pypdf.generic._image_inline import ( + extract_inline_A85, + extract_inline_AHx, + extract_inline_DCT, + extract_inline_RL, +) from . import ReaderDummy, get_data_from_url @@ -883,7 +892,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=False + printing=False, ) writer.add_annotation(0, highlight_annotation) for annot in writer.pages[0]["/Annots"]: @@ -910,7 +919,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=True + printing=True, ) writer.add_annotation(1, highlight_annotation) for annot in writer.pages[1]["/Annots"]: @@ -1350,3 +1359,92 @@ def test_array_operators(): la = len(a) a -= 300 assert len(a) == la + + +def test_unitary_extract_inline_buffer_invalid(): + with pytest.raises(PdfReadError): + extract_inline_AHx(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_AHx(BytesIO(4095 * b"00" + b" ")) + with pytest.raises(PdfReadError): + extract_inline_AHx(BytesIO(b"00")) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1"))) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1") + b"~> Q")) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1234578" * 990))) + with pytest.raises(PdfReadError): + extract_inline_RL(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_RL(BytesIO(b"\x01\x01\x80")) + with pytest.raises(PdfReadError): + extract_inline_DCT(BytesIO(b"\xFF\xD9")) + + +def test_unitary_extract_inline(): + # AHx + b = 16000 * b"00" + assert len(extract_inline_AHx(BytesIO(b + b" EI"))) == len(b) + with pytest.raises(PdfReadError): + extract_inline_AHx(BytesIO(b + b"> ")) + # RL + b = 8200 * b"\x00\xAB" + b"\x80" + assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b) + + # default + # EIDD instead of EI; using A85 + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F [/A85 /Fl]\nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> +EIDD +Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "EI stream not found" in exc.value.args[0] + # EIDD instead of EI; using /Fl (default extraction) + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> +EIDD +Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "Unexpected end of stream" in exc.value.args[0] + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~>EI +BT\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "Unexpected end of stream" in exc.value.args[0] + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 4 /H 4 /CS /G \nID +abcdefghijklmnopEI +Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + assert co.operations[7][0]["data"] == b"abcdefghijklmnop" + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 4 /H 4 \nID +abcdefghijklmnopEI +Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + assert co.operations[7][0]["data"] == b"abcdefghijklmnop" diff --git a/tests/test_images.py b/tests/test_images.py index e77090171..0dbc3956e 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -13,7 +13,7 @@ import pytest from PIL import Image, ImageChops, ImageDraw -from pypdf import PageObject, PdfReader +from pypdf import PageObject, PdfReader, PdfWriter from pypdf.generic import NameObject, NullObject from . import get_data_from_url @@ -352,5 +352,76 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): @pytest.mark.timeout(30) def test_large_compressed_image(): url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf"))) + reader = PdfReader( + BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")) + ) list(reader.pages[0].images) + + +@pytest.mark.enable_socket() +def test_inline_image_extraction(): + """Cf #2598""" + url = "https://github.com/py-pdf/pypdf/files/14982414/lebo102.pdf" + name = "iss2598.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + # there is no error because images are correctly extracted + reader.pages[1].extract_text() + reader.pages[2].extract_text() + reader.pages[3].extract_text() + + url = "https://github.com/py-pdf/pypdf/files/15210011/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf" + name = "iss2598a.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].extract_text() + reader.pages[1].extract_text() + + url = "https://github.com/mozilla/pdf.js/raw/master/test/pdfs/issue14256.pdf" + name = "iss2598b.pdf" + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/71bc5053-cfc7-44ba-b7be-8e2333e2c749" + name = "iss2598b.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + for i in range(8): + assert image_similarity(writer.pages[0].images[i].image, img) == 1 + writer.pages[0].extract_text() + # check recalculation of inline images + assert writer.pages[0].inline_images is not None + writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) + assert writer.pages[0].inline_images is None + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + writer.pages[0].merge_page(reader.pages[0]) + assert list(writer.pages[0].images.keys()) == [ + "/Im0", + "~0~", + "~1~", + "~2~", + "~3~", + "~4~", + "~5~", + "~6~", + "~7~", + "~8~", + "~9~", + "~10~", + "~11~", + "~12~", + "~13~", + "~14~", + "~15~", + ] + + url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" + name = "iss2598c.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/bfb221be-11bd-46fe-8129-55a58088a4b6" + name = "iss2598c.jpg" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/files/15282904/tt.pdf" + name = "iss2598d.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/1a770e1b-9ad2-4125-89ae-6069992dda23" + name = "iss2598d.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[0].images[0].image, img) == 1 diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 94e380dca..93bc0c9e5 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -935,9 +935,7 @@ def test_extra_test_iss1541(): stream = BytesIO() cs.write_to_stream(stream) stream.seek(0) - with pytest.raises(PdfReadError) as exc: - ContentStream(read_object(stream, None, None), None, None).operations - assert exc.value.args[0] == "Unexpected end of stream" + ContentStream(read_object(stream, None, None), None, None).operations b = BytesIO(data.getbuffer()) reader = PdfReader( @@ -1025,6 +1023,11 @@ def test_inline_images(): with pytest.raises(KeyError) as exc: reader.pages[2]._get_image(("test",)) + url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" + name = "iss2598c.pdf" # test data also used in test_images.py/test_inline_image_extraction() + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert len(reader.pages[0].images) == 3 + @pytest.mark.enable_socket() def test_iss():