Skip to content

Commit

Permalink
Merge pull request #72 from robertknight/hocr
Browse files Browse the repository at this point in the history
Add option to get output in hOCR format
  • Loading branch information
robertknight authored Jan 27, 2023
2 parents b75605f + e043eec commit acc939d
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 9 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ EMCC_FLAGS =\
-sMODULARIZE=1 \
-sALLOW_MEMORY_GROWTH\
-sMAXIMUM_MEMORY=128MB \
-std=c++20 \
--post-js=src/tesseract-init.js

# Build main WASM binary for browsers that support WASM SIMD.
Expand Down
41 changes: 39 additions & 2 deletions examples/web/ocr-app.js
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ function OCRDemoApp() {
const [wordBoxes, setWordBoxes] = useState([]);
const [orientation, setOrientation] = useState(null);
const [ocrTime, setOCRTime] = useState(null);
const [outputFormat, setOutputFormat] = useState("text");

const canvasRef = useRef(null);

Expand All @@ -137,6 +138,8 @@ function OCRDemoApp() {
const context = canvasRef.current.getContext("2d");
context.drawImage(documentImage, 0, 0);

let cancelled = false;

const doOCR = async () => {
if (!ocrClient.current) {
// Initialize the OCR engine when recognition is performed for the first
Expand All @@ -157,6 +160,9 @@ function OCRDemoApp() {
try {
setStatus("Loading image");
await ocr.loadImage(documentImage);
if (cancelled) {
return;
}

const orientation = await ocr.getOrientation();
setOrientation(orientation);
Expand All @@ -165,14 +171,30 @@ function OCRDemoApp() {
setStatus("Recognizing text");
let boxes = await ocr.getTextBoxes("word", setOCRProgress);
boxes = boxes.filter((box) => box.text.trim() !== "");

if (cancelled) {
return;
}
setWordBoxes(boxes);

const endTime = performance.now();
setOCRTime(Math.round(endTime - startTime));

// Get the text as a single string. This will be quick since OCR has
// already been performed.
const text = await ocr.getText();
let text;
switch (outputFormat) {
case "hocr":
text = await ocr.getHOCR();
break;
case "text":
text = await ocr.getText();
break;
}

if (cancelled) {
return;
}
setDocumentText(text);
} catch (err) {
setError(err);
Expand All @@ -182,7 +204,11 @@ function OCRDemoApp() {
}
};
doOCR();
}, [documentImage]);

return () => {
cancelled = true;
};
}, [documentImage, outputFormat]);

const loadImage = async (file) => {
try {
Expand Down Expand Up @@ -235,6 +261,17 @@ function OCRDemoApp() {
</div>
)}
<FileDropZone onDrop={loadImage} />
<div>
<label htmlFor="output-format">Output format:</label>
<select
id="output-format"
onChange={(e) => setOutputFormat(e.target.value)}
value={outputFormat}
>
<option value="text">Plain text</option>
<option value="hocr">hOCR</option>
</select>
</div>
{status !== null && <div>{status}</div>}
{ocrTime !== null && (
<div>
Expand Down
13 changes: 6 additions & 7 deletions patches/tesseract.diff
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c6845cb..5a948117 100644
index 8c6845cb..fdcfc4a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
Expand All @@ -20,23 +20,22 @@ index 8c6845cb..5a948117 100644
src/wordrec/*.cpp)

if(DISABLED_LEGACY_ENGINE)
@@ -713,14 +712,7 @@ file(

@@ -714,13 +713,7 @@ file(
set(TESSERACT_SRC
${TESSERACT_SRC}
- src/api/baseapi.cpp
src/api/baseapi.cpp
- src/api/capi.cpp
- src/api/renderer.cpp
- src/api/altorenderer.cpp
- src/api/hocrrenderer.cpp
- src/api/lstmboxrenderer.cpp
- src/api/pdfrenderer.cpp
- src/api/wordstrboxrenderer.cpp)
+ src/api/baseapi.cpp)
+ src/api/hocrrenderer.cpp)

set(TESSERACT_CONFIGS
tessdata/configs/alto
@@ -858,14 +850,16 @@ endif()
@@ -858,14 +851,16 @@ endif()
# EXECUTABLE tesseract
# ##############################################################################

Expand All @@ -60,7 +59,7 @@ index 8c6845cb..5a948117 100644
endif()

# ##############################################################################
@@ -899,7 +893,11 @@ write_basic_package_version_file(
@@ -899,7 +894,11 @@ write_basic_package_version_file(

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
Expand Down
28 changes: 28 additions & 0 deletions src/lib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <tesseract/baseapi.h>
#include <tesseract/ocrclass.h>

#include <format>
#include <memory>
#include <string>
#include <vector>
Expand Down Expand Up @@ -212,6 +213,32 @@ class OCREngine {
return string_from_raw(tesseract_->GetUTF8Text());
}

// Run recognition on the current image (via `DoOCR`, which receives
// `progress_callback`) and return the result as a complete hOCR document.
std::string GetHOCR(const emscripten::val& progress_callback) {
  DoOCR(progress_callback);

  // Markup returned by `GetHOCRText`; it is inserted into the `<body>` of the
  // document assembled below.
  const auto page_markup = string_from_raw(tesseract_->GetHOCRText(0));

  // `TessHOcrRenderer` expects to write to a file, so it can't be used
  // directly here. Instead the document header and footer are copied from
  // `TessHOcrRenderer::BeginDocumentHandler` and
  // `TessHOcrRenderer::EndDocumentHandler` respectively. The two `{}`
  // placeholders receive the Tesseract version and the page markup.
  return std::format(R"(<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>hOCR text</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='tesseract {}' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf' />
</head>
<body>
{}
</body>
</html>)",
                     tesseract_->Version(), page_markup);
}

Orientation GetOrientation() {
// Tesseract's orientation detection is part of the legacy (non-LSTM)
// engine, which is not compiled in to reduce binary size. Hence we use
Expand Down Expand Up @@ -340,6 +367,7 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
.constructor<>()
.function("clearImage", &OCREngine::ClearImage)
.function("getBoundingBoxes", &OCREngine::GetBoundingBoxes)
.function("getHOCR", &OCREngine::GetHOCR)
.function("getOrientation", &OCREngine::GetOrientation)
.function("getText", &OCREngine::GetText)
.function("getTextBoxes", &OCREngine::GetTextBoxes)
Expand Down
19 changes: 19 additions & 0 deletions src/ocr-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,25 @@ export class OCRClient {
}
}

/**
 * Return the text of the current image in hOCR format (see
 * https://en.wikipedia.org/wiki/HOCR), performing layout analysis and text
 * recognition first if that has not already been done.
 *
 * @param onProgress - Optional listener. When supplied it is registered as a
 *   progress listener for the duration of the call and removed again
 *   afterwards, whether the call succeeds or fails.
 * @returns The hOCR output as a string.
 */
async getHOCR(onProgress?: ProgressListener): Promise<string> {
  const engine = await this._ocrEngine;

  // Without a listener there is nothing to register or clean up.
  if (!onProgress) {
    return await engine.getHOCR();
  }

  this._addProgressListener(onProgress);
  try {
    return await engine.getHOCR();
  } finally {
    // Always detach the listener, even if recognition threw.
    this._removeProgressListener(onProgress);
  }
}

/**
* Attempt to determine the orientation of the image.
*
Expand Down
16 changes: 16 additions & 0 deletions src/ocr-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,22 @@ export class OCREngine {
});
}

/**
 * Return the page text in hOCR format, performing layout analysis and text
 * recognition on the current image first if that has not already been done.
 *
 * A text recognition model must be loaded with {@link loadModel} before this
 * is called.
 *
 * @param onProgress - Optional callback invoked with each progress value
 *   reported by the underlying engine.
 * @returns The hOCR output as a string.
 */
getHOCR(onProgress?: ProgressListener): string {
  this._checkImageLoaded();
  this._checkModelLoaded();

  // Relay each progress update to the caller's listener (if any) and to the
  // progress channel (if one is set).
  const forwardProgress = (progress: number) => {
    onProgress?.(progress);
    this._progressChannel?.postMessage({ progress });
  };

  return this._engine.getHOCR(forwardProgress);
}

/**
* Attempt to determine the orientation of the document image in degrees.
*
Expand Down
18 changes: 18 additions & 0 deletions test/ocr-client-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ describe("OCRClient", () => {
}
});

it("extracts hOCR from image", async function () {
  this.timeout(5_000);

  const imageData = await loadImage(resolve("./small-test-page.jpg"));
  await ocr.loadImage(imageData);
  const html = await ocr.getHOCR();

  // Fragments that must appear in the output: the page element's attributes
  // and the complete `word_1_1` span with its bounding box and confidence.
  const requiredFragments = [
    "class='ocr_page' id='page_1'",
    "<span class='ocrx_word' id='word_1_1' title='bbox 37 233 135 265; x_wconf 93'>Image</span>",
  ];

  requiredFragments.forEach((fragment) => {
    assert.include(html, fragment);
  });
});

it("reports recognition progress", async function () {
this.timeout(5_000);

Expand Down
18 changes: 18 additions & 0 deletions test/ocr-engine-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,24 @@ describe("OCREngine", () => {
}
});

it("extracts hOCR from image", async function () {
  this.timeout(5_000);

  const imageData = await loadImage(resolve("./small-test-page.jpg"));
  ocr.loadImage(imageData);
  const html = ocr.getHOCR();

  // Fragments that must appear in the output: the page element's attributes
  // and the complete `word_1_1` span with its bounding box and confidence.
  const requiredFragments = [
    "class='ocr_page' id='page_1'",
    "<span class='ocrx_word' id='word_1_1' title='bbox 37 233 135 265; x_wconf 93'>Image</span>",
  ];

  requiredFragments.forEach((fragment) => {
    assert.include(html, fragment);
  });
});

it("reports recognition progress", async function () {
this.timeout(5_000);

Expand Down

0 comments on commit acc939d

Please sign in to comment.