Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pass DOM parser to epub tool #457

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/cf/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ async function startPuppeteer() {
const browser = await puppeteer.launch({ headless: true });

// for debugging, switch to this
- // const browser = puppeteer.launch({ headless: false, slowMo: 250 });
+ // const browser = await puppeteer.launch({ headless: false, slowMo: 250 });

return { browser, [Symbol.dispose]: () => browser.close() };
}
Expand Down
3 changes: 2 additions & 1 deletion packages/cli/epub-to-text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import type { Readable } from "node:stream";
import { arrayBuffer } from "node:stream/consumers";
import { fileURLToPath } from "node:url";
import { epubToText } from "@bitextual/epub/epub.js";
+ import { DOMParser } from "@xmldom/xmldom";

export { go };

Expand All @@ -12,7 +13,7 @@ async function main() {

async function go(r: Readable): Promise<string> {
const ab = await arrayBuffer(r);
- return epubToText(ab);
+ return epubToText(new DOMParser(), ab);
}

const currentFile = fileURLToPath(import.meta.url);
Expand Down
1 change: 1 addition & 0 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"dependencies": {
"@bitextual/core": "*",
"@bitextual/epub": "*",
+ "@xmldom/xmldom": "^0.8.10",
"franc-min": "^6.2.0"
}
}
1 change: 0 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
},
"dependencies": {
"@bitextual/hunalign": "^0.0.1",
- "@xmldom/xmldom": "^0.8.10",
"html-to-text": "^9.0.5",
"jszip": "^3.10.1"
}
Expand Down
5 changes: 3 additions & 2 deletions packages/epub/epub.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { strict as assert } from "node:assert";
import { readFile } from "node:fs/promises";
import { test } from "node:test";
import { fileURLToPath } from "node:url";
+ import { DOMParser } from "@xmldom/xmldom";
import { epubToText } from "./epub.js";

const EPUB2_PATH_REL = "@bitextual/test/bovary.english.epub";
Expand All @@ -22,12 +23,12 @@ test("epub", async (t) => {

const epubToTextEpub2 = (expected: string) => async () => {
const bytes = await readFile(EPUB2_PATH);
- const text = await epubToText(bytes);
+ const text = await epubToText(new DOMParser(), bytes);
assert.strictEqual(text, expected);
};

const epubToTextEpub3 = (expected: string) => async () => {
const bytes = await readFile(EPUB3_PATH);
- const text = await epubToText(bytes);
+ const text = await epubToText(new DOMParser(), bytes);
assert.strictEqual(text, expected);
};
11 changes: 7 additions & 4 deletions packages/epub/epub.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
- import { DOMParser } from "@xmldom/xmldom";
+ import type { DOMParser } from "@xmldom/xmldom";
import { compile as compileHtmlConvert } from "html-to-text";
import JSZip from "jszip";

export { epubToText };

- async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
+ async function epubToText(
+ domParser: DOMParser,
+ epubBytes: ArrayBuffer,
+ ): Promise<string> {
const zip = await JSZip.loadAsync(epubBytes);
const files = zip.files;
const containerPath = "META-INF/container.xml";
Expand All @@ -14,7 +17,7 @@ async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
}
const containerTxt = await container.async("text");

- const containerDom = new DOMParser().parseFromString(
+ const containerDom = domParser.parseFromString(
containerTxt,
"application/xml",
);
Expand All @@ -35,7 +38,7 @@ async function epubToText(epubBytes: ArrayBuffer): Promise<string> {
throw new Error(`opf file not found at ${rootPath}`);
}
const opfTxt = await opf.async("text");
- const opfDom = new DOMParser().parseFromString(opfTxt, "application/xml");
+ const opfDom = domParser.parseFromString(opfTxt, "application/xml");
if (opfDom === undefined) {
throw new Error(`failed to parse XML DOM from ${opfTxt}`);
}
Expand Down
1 change: 1 addition & 0 deletions packages/epub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"@tsconfig/strictest": "2.0.3",
"@types/html-to-text": "9.0.4",
"@types/node": "20.11.26",
+ "@xmldom/xmldom": "0.8.10",
"tsx": "4.7.1",
"typescript": "5.4.2"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/web/src/main.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- import { type Remote, wrap } from "comlink";
+ import { type Remote, proxy, wrap } from "comlink";
import type { RenderAlignmentFn } from "./worker.js";

const worker = new Worker("worker.js", { type: "module" });
Expand Down Expand Up @@ -110,7 +110,9 @@ async function onSubmit(event: Event) {
["version", version],
] as const;

+ const domParser = proxy(new DOMParser());
const rendered = await renderAlignment(
+ domParser,
model.sourceFile,
model.targetFile,
meta,
Expand Down
5 changes: 3 additions & 2 deletions packages/web/src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ expose(renderAlignment);
type RenderAlignmentFn = typeof renderAlignment;

async function renderAlignment(
+ domParser: DOMParser,
source: File,
target: File,
metaArr: readonly (readonly [string, string])[],
Expand All @@ -21,8 +22,8 @@ async function renderAlignment(
target.arrayBuffer(),
]);
const [sourceText, targetText] = await Promise.all([
- epubToText(sourceData),
- epubToText(targetData),
+ epubToText(domParser, sourceData),
+ epubToText(domParser, targetData),
]);
const sourceLang = franc(sourceText);
const targetLang = franc(targetText);
Expand Down
Loading