Skip to content

Commit

Permalink
pass DOM parser to epub tool
Browse files (browse the repository at this point in the history)
resolves #456
  • Loading branch information
wydengyre committed Mar 13, 2024
1 parent d66016d commit 9abfd74
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 11 deletions.
3 changes: 2 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/cli/epub-to-text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ async function main() {

/**
 * Awaits `arrayBuffer(r)` and hands the result to `epubToText`, returning
 * the string that `epubToText` resolves to.
 *
 * NOTE: this block is shown in diff view with the +/- markers stripped, so
 * both the pre-commit and the post-commit `return` line appear below.
 *
 * @param r - stream passed to `arrayBuffer`
 * @returns the text produced by `epubToText`
 */
async function go(r: Readable): Promise<string> {
const ab = await arrayBuffer(r);
// Pre-commit version (the one deletion in this hunk): no parser argument.
return epubToText(ab);
// Post-commit version (the one addition): a DOMParser instance is now
// passed as the first argument, per the commit title "pass DOM parser to epub tool".
return epubToText(new DOMParser(), ab);
}

const currentFile = fileURLToPath(import.meta.url);
Expand Down
1 change: 1 addition & 0 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"dependencies": {
"@bitextual/core": "*",
"@bitextual/epub": "*",
"@xmldom/xmldom": "^0.8.10",
"franc-min": "^6.2.0"
}
}
1 change: 0 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
},
"dependencies": {
"@bitextual/hunalign": "^0.0.1",
"@xmldom/xmldom": "^0.8.10",
"html-to-text": "^9.0.5",
"jszip": "^3.10.1"
}
Expand Down
5 changes: 3 additions & 2 deletions packages/epub/epub.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { strict as assert } from "node:assert";
import { readFile } from "node:fs/promises";
import { test } from "node:test";
import { fileURLToPath } from "node:url";
import { DOMParser } from "@xmldom/xmldom";
import { epubToText } from "./epub.js";

const EPUB2_PATH_REL = "@bitextual/test/bovary.english.epub";
Expand All @@ -22,12 +23,12 @@ test("epub", async (t) => {

/**
 * Builds an async test body: it reads the file at `EPUB2_PATH`, converts the
 * bytes with `epubToText`, and asserts the result is strictly equal to
 * `expected`.
 *
 * NOTE: this block is shown in diff view with the +/- markers stripped, so
 * the `const text` line appears twice (old version, then new version).
 *
 * @param expected - the text the conversion must produce
 * @returns an async function that performs the read, conversion and assertion
 */
const epubToTextEpub2 = (expected: string) => async () => {
const bytes = await readFile(EPUB2_PATH);
// Pre-commit version (deleted): epubToText called with the bytes only.
const text = await epubToText(bytes);
// Post-commit version (added): a fresh DOMParser is passed as the first argument.
const text = await epubToText(new DOMParser(), bytes);
assert.strictEqual(text, expected);
};

/**
 * Builds an async test body: it reads the file at `EPUB3_PATH`, converts the
 * bytes with `epubToText`, and asserts the result is strictly equal to
 * `expected`.
 *
 * NOTE: this block is shown in diff view with the +/- markers stripped, so
 * the `const text` line appears twice (old version, then new version).
 *
 * @param expected - the text the conversion must produce
 * @returns an async function that performs the read, conversion and assertion
 */
const epubToTextEpub3 = (expected: string) => async () => {
const bytes = await readFile(EPUB3_PATH);
// Pre-commit version (deleted): epubToText called with the bytes only.
const text = await epubToText(bytes);
// Post-commit version (added): a fresh DOMParser is passed as the first argument.
const text = await epubToText(new DOMParser(), bytes);
assert.strictEqual(text, expected);
};
11 changes: 7 additions & 4 deletions packages/epub/epub.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import { DOMParser } from "@xmldom/xmldom";
import type { DOMParser } from "@xmldom/xmldom";
import { compile as compileHtmlConvert } from "html-to-text";
import JSZip from "jszip";

export { epubToText };

async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
async function epubToText(
domParser: DOMParser,
epubBytes: ArrayBuffer,
): Promise<string> {
const zip = await JSZip.loadAsync(epubBytes);
const files = zip.files;
const containerPath = "META-INF/container.xml";
Expand All @@ -14,7 +17,7 @@ async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
}
const containerTxt = await container.async("text");

const containerDom = new DOMParser().parseFromString(
const containerDom = domParser.parseFromString(
containerTxt,
"application/xml",
);
Expand All @@ -35,7 +38,7 @@ async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
throw new Error(`opf file not found at ${rootPath}`);
}
const opfTxt = await opf.async("text");
const opfDom = new DOMParser().parseFromString(opfTxt, "application/xml");
const opfDom = domParser.parseFromString(opfTxt, "application/xml");
if (opfDom === undefined) {
throw new Error(`failed to parse XML DOM from ${opfTxt}`);
}
Expand Down
1 change: 1 addition & 0 deletions packages/epub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"@tsconfig/strictest": "2.0.3",
"@types/html-to-text": "9.0.4",
"@types/node": "20.11.26",
"@xmldom/xmldom": "0.8.10",
"tsx": "4.7.1",
"typescript": "5.4.2"
},
Expand Down
5 changes: 3 additions & 2 deletions packages/web/src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ async function renderAlignment(
source.arrayBuffer(),
target.arrayBuffer(),
]);
const domParser = new DOMParser();
const [sourceText, targetText] = await Promise.all([
epubToText(sourceData),
epubToText(targetData),
epubToText(domParser, sourceData),
epubToText(domParser, targetData),
]);
const sourceLang = franc(sourceText);
const targetLang = franc(targetText);
Expand Down

0 comments on commit 9abfd74

Please sign in to comment.