Increase pdfminer's bufsiz to mitigate token splitting issue

Fixes #1361
ocrmypdf · Jul 31, 2024 · d35d008 · d35d008
1 parent f5662d5
commit d35d008
Showing 1 changed file with 5 additions and 3 deletions.
diff --git a/src/ocrmypdf/pdfinfo/layout.py b/src/ocrmypdf/pdfinfo/layout.py
@@ -17,6 +17,7 @@
 import pdfminer.encodingdb
 import pdfminer.pdfdevice
 import pdfminer.pdfinterp
+import pdfminer.psparser
 from pdfminer.converter import PDFLayoutAnalyzer
 from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox
 from pdfminer.pdfcolor import PDFColorSpace
@@ -58,9 +59,10 @@ def pdfsimplefont__init__(
 
 setattr(PDFSimpleFont, '__init__', pdfsimplefont__init__)
 
-#
-# pdfminer patches when creator is PScript5.dll
-#
+# Patch pdfminer.six buffer size
+# The parser doesn't properly handle keyword tokens that are split across the end of the
+# buffer, so increase the buffer size to something far larger than will ever be seen.
+pdfminer.psparser.PSBaseParser.BUFSIZ = 256 * 1024 * 1024
 
 
 def pdftype3font__pscript5_get_height(self):