Merge pull request #72 from robertknight/hocr

Add option to get output in hOCR format
robertknight · Jan 27, 2023 · acc939d · acc939d
2 parents b75605f + e043eec
commit acc939d
Show file tree

Hide file tree

Showing 8 changed files with 145 additions and 9 deletions.
diff --git a/Makefile b/Makefile
@@ -165,6 +165,7 @@ EMCC_FLAGS =\
  -sMODULARIZE=1 \
  -sALLOW_MEMORY_GROWTH\
  -sMAXIMUM_MEMORY=128MB \
+ -std=c++20 \
  --post-js=src/tesseract-init.js
 
 # Build main WASM binary for browsers that support WASM SIMD.

diff --git a/examples/web/ocr-app.js b/examples/web/ocr-app.js
@@ -117,6 +117,7 @@ function OCRDemoApp() {
  const [wordBoxes, setWordBoxes] = useState([]);
  const [orientation, setOrientation] = useState(null);
  const [ocrTime, setOCRTime] = useState(null);
+ const [outputFormat, setOutputFormat] = useState("text");
 
  const canvasRef = useRef(null);
 
@@ -137,6 +138,8 @@ function OCRDemoApp() {
  const context = canvasRef.current.getContext("2d");
  context.drawImage(documentImage, 0, 0);
 
+ let cancelled = false;
+
  const doOCR = async () => {
  if (!ocrClient.current) {
  // Initialize the OCR engine when recognition is performed for the first
@@ -157,6 +160,9 @@ function OCRDemoApp() {
  try {
  setStatus("Loading image");
  await ocr.loadImage(documentImage);
+ if (cancelled) {
+ return;
+ }
 
  const orientation = await ocr.getOrientation();
  setOrientation(orientation);
@@ -165,14 +171,30 @@ function OCRDemoApp() {
  setStatus("Recognizing text");
  let boxes = await ocr.getTextBoxes("word", setOCRProgress);
  boxes = boxes.filter((box) => box.text.trim() !== "");
+
+ if (cancelled) {
+ return;
+ }
  setWordBoxes(boxes);
 
  const endTime = performance.now();
  setOCRTime(Math.round(endTime - startTime));
 
  // Get the text as a single string. This will be quick since OCR has
  // already been performed.
- const text = await ocr.getText();
+ let text;
+ switch (outputFormat) {
+ case "hocr":
+ text = await ocr.getHOCR();
+ break;
+ case "text":
+ text = await ocr.getText();
+ break;
+ }
+
+ if (cancelled) {
+ return;
+ }
  setDocumentText(text);
  } catch (err) {
  setError(err);
@@ -182,7 +204,11 @@ function OCRDemoApp() {
  }
  };
  doOCR();
- }, [documentImage]);
+
+ return () => {
+ cancelled = true;
+ };
+ }, [documentImage, outputFormat]);
 
  const loadImage = async (file) => {
  try {
@@ -235,6 +261,17 @@ function OCRDemoApp() {
  </div>
  )}
  <FileDropZone onDrop={loadImage} />
+ <div>
+ <label htmlFor="output-format">Output format:</label>
+ <select
+ id="output-format"
+ onChange={(e) => setOutputFormat(e.target.value)}
+ value={outputFormat}
+ >
+ <option value="text">Plain text</option>
+ <option value="hocr">hOCR</option>
+ </select>
+ </div>
  {status !== null && <div>{status}…</div>}
  {ocrTime !== null && (
  <div>

diff --git a/patches/tesseract.diff b/patches/tesseract.diff
@@ -1,5 +1,5 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 8c6845cb..5a948117 100644
+index 8c6845cb..fdcfc4a8 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
@@ -20,23 +20,22 @@ index 8c6845cb..5a948117 100644
  src/wordrec/*.cpp)
 
  if(DISABLED_LEGACY_ENGINE)
-@@ -713,14 +712,7 @@ file(
-
+@@ -714,13 +713,7 @@ file(
  set(TESSERACT_SRC
  ${TESSERACT_SRC}
-- src/api/baseapi.cpp
+  src/api/baseapi.cpp
 - src/api/capi.cpp
 - src/api/renderer.cpp
 - src/api/altorenderer.cpp
 - src/api/hocrrenderer.cpp
 - src/api/lstmboxrenderer.cpp
 - src/api/pdfrenderer.cpp
 - src/api/wordstrboxrenderer.cpp)
-+ src/api/baseapi.cpp)
++ src/api/hocrrenderer.cpp)
 
  set(TESSERACT_CONFIGS
  tessdata/configs/alto
-@@ -858,14 +850,16 @@ endif()
+@@ -858,14 +851,16 @@ endif()
  # EXECUTABLE tesseract
  # ##############################################################################
 
@@ -60,7 +59,7 @@ index 8c6845cb..5a948117 100644
  endif()
 
  # ##############################################################################
-@@ -899,7 +893,11 @@ write_basic_package_version_file(
+@@ -899,7 +894,11 @@ write_basic_package_version_file(
 
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

diff --git a/src/lib.cpp b/src/lib.cpp
@@ -4,6 +4,7 @@
 #include <tesseract/baseapi.h>
 #include <tesseract/ocrclass.h>
 
+#include <format>
 #include <memory>
 #include <string>
 #include <vector>
@@ -212,6 +213,32 @@ class OCREngine {
  return string_from_raw(tesseract_->GetUTF8Text());
  }
 
+ // Return the OCR result for the current image as a complete hOCR document.
+ //
+ // Runs `DoOCR` with `progress_callback`, then wraps the markup returned by
+ // `GetHOCRText` in a fixed XHTML header and footer.
+ std::string GetHOCR(const emscripten::val& progress_callback) {
+ DoOCR(progress_callback);
+ // NOTE(review): the `0` argument is presumably the page number expected by
+ // Tesseract's `GetHOCRText` - confirm against the Tesseract API.
+ auto hocr_body = string_from_raw(tesseract_->GetHOCRText(0));
+
+ // The header and footer of the hOCR document are taken from
+ // `TessHOcrRenderer::BeginDocumentHandler` and
+ // `TessHOcrRenderer::EndDocumentHandler` respectively. We can't use that
+ // class directly because it expects to write to a file.
+ //
+ // The two `{}` placeholders in the template are filled, in order, with the
+ // Tesseract version string and the hOCR body.
+ auto hocr_doc = std::format(R"(<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>hOCR text</title>
+ <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
+ <meta name='ocr-system' content='tesseract {}' />
+ <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf' />
+</head>
+<body>
+ {}
+</body>
+</html>)",
+ tesseract_->Version(), hocr_body);
+
+ return hocr_doc;
+ }
+
  Orientation GetOrientation() {
  // Tesseract's orientation detection is part of the legacy (non-LSTM)
  // engine, which is not compiled in to reduce binary size. Hence we use
@@ -340,6 +367,7 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
  .constructor<>()
  .function("clearImage", &OCREngine::ClearImage)
  .function("getBoundingBoxes", &OCREngine::GetBoundingBoxes)
+ .function("getHOCR", &OCREngine::GetHOCR)
  .function("getOrientation", &OCREngine::GetOrientation)
  .function("getText", &OCREngine::GetText)
  .function("getTextBoxes", &OCREngine::GetTextBoxes)

diff --git a/src/ocr-client.ts b/src/ocr-client.ts
@@ -218,6 +218,25 @@ export class OCRClient {
  }
  }
 
+ /**
+ * Perform layout analysis and text recognition on the current image, if
+ * not already done, and return the image's text in hOCR format (see
+ * https://en.wikipedia.org/wiki/HOCR).
+ *
+ * @param onProgress - Optional listener. When given, it is registered with
+ *   `_addProgressListener` before the engine call and removed again with
+ *   `_removeProgressListener` once the call settles.
+ * @returns The string produced by the engine's `getHOCR` method.
+ */
+ async getHOCR(onProgress?: ProgressListener): Promise<string> {
+ const engine = await this._ocrEngine;
+ if (onProgress) {
+ this._addProgressListener(onProgress);
+ }
+ try {
+ // No callback is passed to the engine here.
+ // NOTE(review): progress presumably reaches registered listeners through a
+ // separate channel rather than this call - confirm.
+ return await engine.getHOCR();
+ } finally {
+ // Unregister the listener whether the engine call succeeded or threw.
+ if (onProgress) {
+ this._removeProgressListener(onProgress);
+ }
+ }
+ }
+
  /**
  * Attempt to determine the orientation of the image.
  *

diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts
@@ -282,6 +282,22 @@ export class OCREngine {
  });
  }
 
+ /**
+ * Perform layout analysis and text recognition on the current image, if
+ * not already done, and return the page text in hOCR format.
+ *
+ * A text recognition model must be loaded with {@link loadModel} before this
+ * is called.
+ *
+ * @param onProgress - Optional callback invoked with each progress value
+ *   reported by the underlying engine.
+ * @returns The string returned by the underlying engine's `getHOCR` call.
+ */
+ getHOCR(onProgress?: ProgressListener): string {
+ // NOTE(review): these checks presumably throw when no image or model has
+ // been loaded - confirm against their definitions.
+ this._checkImageLoaded();
+ this._checkModelLoaded();
+ return this._engine.getHOCR((progress: number) => {
+ // Forward each progress value to the caller's listener (if any) and post
+ // it on the progress channel (if one is set).
+ onProgress?.(progress);
+ this._progressChannel?.postMessage({ progress });
+ });
+ }
+
  /**
  * Attempt to determine the orientation of the document image in degrees.
  *

diff --git a/test/ocr-client-test.js b/test/ocr-client-test.js
@@ -88,6 +88,24 @@ describe("OCRClient", () => {
  }
  });
 
+ // Loads the small test page into the client and checks that the hOCR output
+ // contains the expected page attributes and first-word span.
+ it("extracts hOCR from image", async function () {
+ this.timeout(5_000);
+
+ const imageData = await loadImage(resolve("./small-test-page.jpg"));
+ await ocr.loadImage(imageData);
+
+ const html = await ocr.getHOCR();
+
+ // Substrings that must appear in the output: the page element's attributes
+ // and the complete span for the first word, including its bounding box and
+ // confidence value.
+ const expectedPhrases = [
+ "class='ocr_page' id='page_1'",
+ "<span class='ocrx_word' id='word_1_1' title='bbox 37 233 135 265; x_wconf 93'>Image</span>",
+ ];
+
+ for (let phrase of expectedPhrases) {
+ assert.include(html, phrase);
+ }
+ });
+
  it("reports recognition progress", async function () {
  this.timeout(5_000);
 

diff --git a/test/ocr-engine-test.js b/test/ocr-engine-test.js
@@ -286,6 +286,24 @@ describe("OCREngine", () => {
  }
  });
 
+ // Loads the small test page into the engine and checks that the hOCR output
+ // contains the expected page attributes and first-word span.
+ it("extracts hOCR from image", async function () {
+ this.timeout(5_000);
+
+ const imageData = await loadImage(resolve("./small-test-page.jpg"));
+ ocr.loadImage(imageData);
+
+ // Called without a progress listener; the engine method's parameter is
+ // optional.
+ const html = ocr.getHOCR();
+
+ // Substrings that must appear in the output: the page element's attributes
+ // and the complete span for the first word, including its bounding box and
+ // confidence value.
+ const expectedPhrases = [
+ "class='ocr_page' id='page_1'",
+ "<span class='ocrx_word' id='word_1_1' title='bbox 37 233 135 265; x_wconf 93'>Image</span>",
+ ];
+
+ for (let phrase of expectedPhrases) {
+ assert.include(html, phrase);
+ }
+ });
+
  it("reports recognition progress", async function () {
  this.timeout(5_000);