Skip to content

Commit

Permalink
support multi-char code points in encodeCharacters
Browse files Browse the repository at this point in the history
also provide a natural way to disable encoding
for characters enabled by default
(mainly for markdown, because html-to-text doesn't come with any)
  • Loading branch information
KillyMXI committed Dec 7, 2022
1 parent d960c11 commit 06c4272
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 6 deletions.
12 changes: 8 additions & 4 deletions packages/base/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { parseDocument } from 'htmlparser2';
import { DecisionTree } from 'selderee';

import { BlockTextBuilder } from './block-text-builder';
import { limitedDepthRecursive } from './util';
import { limitedDepthRecursive, unicodeEscape } from './util';


/**
Expand Down Expand Up @@ -162,20 +162,24 @@ function recursiveWalk (walk, dom, builder) {
}

/**
* @param { Object<string,string> } dict
* @param { Object<string,string | false> } dict
* A dictionary where keys are characters to replace
* and values are replacement strings.
*
* First code point from dict keys is used.
* Compound emojis with ZWJ are not supported (not until Node 16).
*
* @returns { ((str: string) => string) | undefined }
*/
function makeReplacerFromDict (dict) {
if (!dict || Object.keys(dict).length === 0) {
return undefined;
}
const entries = [...Object.entries(dict)];
/** @type { [string, string][] } */
const entries = Object.entries(dict).filter(([, v]) => v !== false);
const regex = new RegExp(
entries
.map(([c]) => `(\\u${(c.charCodeAt(0).toString(16).padStart(4, '0'))})`)
.map(([c]) => `(${unicodeEscape([...c][0])})`)
.join('|'),
'g'
);
Expand Down
3 changes: 2 additions & 1 deletion packages/base/src/typedefs.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
* @property { boolean } [decodeEntities]
* Specify whether HTML entities should be decoded in the text output.
*
* @property { Object<string,string> | ((str: string) => string) | undefined } [encodeCharacters]
* @property { Object<string,string|false> | ((str: string) => string) | undefined } [encodeCharacters]
* A dictionary mapping from input text characters to escape sequences
* (you can set values to false to disable escaping characters that are enabled by default)
* or a function that does the replacement.
*
* @property { Object< string, FormatCallback > } [formatters = {}]
Expand Down
14 changes: 13 additions & 1 deletion packages/base/src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ function trimCharacterEnd (str, char) {
: str;
}

/**
 * Convert every UTF-16 code unit of a string into a `\uXXXX` escape sequence.
 * Escaping everything this way lets the result be used to safely compose regular expressions.
 *
 * Characters outside the Basic Multilingual Plane are emitted as two escapes
 * (one per surrogate half), because the input is processed unit by unit.
 *
 * @param { string } str A string to escape.
 * @returns { string } A string of unicode escape sequences.
 */
function unicodeEscape (str) {
  // Turn a single code unit into its zero-padded, four-digit hexadecimal escape.
  const escapeUnit = (unit) => {
    const hex = unit.charCodeAt(0).toString(16);
    return `\\u${hex.padStart(4, '0')}`;
  };
  // `[\s\S]` without the `u` flag matches any single code unit, line breaks included.
  return str.replace(/[\s\S]/g, escapeUnit);
}

/**
* Deduplicate an array by a given key callback.
* Item properties are merged recursively and with the preference for last defined values.
Expand Down Expand Up @@ -149,5 +160,6 @@ export {
numberToLetterSequence,
numberToRoman,
trimCharacter,
trimCharacterEnd
trimCharacterEnd,
unicodeEscape
};
14 changes: 14 additions & 0 deletions packages/html-to-md/test/html-to-md.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,17 @@ test(
snapshotMacro,
'<img src="test.png" alt="**alt text**" title="*title*">'
);

// Snapshot test for opting characters out of the default escaping.
// The dictionary shows two ways to do it: map a character to itself ('(' -> '(')
// or set its value to false (')').
// NOTE(review): the argument order (input HTML, then options) is defined by
// snapshotMacro, which is not visible here; confirm against its definition.
test(
'should allow to disable encoding of some characters encoded by default',
snapshotMacro,
'<p>!#[]()*+-.\\_`{}</p>',
{ encodeCharacters: { '(': '(', ')': false } }
);

// Snapshot test for adding custom replacements on top of the default escaping.
// Fixture characters (restored from mis-decoded UTF-8):
//   eye                  = U+1F441 U+FE0F
//   eye in speech bubble = U+1F441 U+FE0F U+200D U+1F5E8 U+FE0F (ZWJ sequence)
//   smiley               = U+1F600
// Only the first code point of each dictionary key is used for matching, so the
// variation selector and ZWJ tail that follow the eye are expected to be left
// in the output after ':eye:'.
// NOTE(review): the argument order (input HTML, then options) is defined by
// snapshotMacro, which is not visible here; confirm against its definition.
test(
  'should allow to encode additional symbols (single code point)',
  snapshotMacro,
  '<p>!#[]()*+-.\\_`{}</p><p>👁️ - eye</p><p>👁️‍🗨️ - eye in a speech bubble</p><p>😀 - smiley</p>',
  { encodeCharacters: { '👁️': ':eye:', '😀': ':smiley:' } }
);
22 changes: 22 additions & 0 deletions packages/html-to-md/test/snapshots/html-to-md.js.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,25 @@ Generated by [AVA](https://avajs.dev).
> ```
'![&ast;&ast;alt text&ast;&ast;](test.png "&ast;title&ast;")'
## should allow to disable encoding of some characters encoded by default
> ```html
> <p>!#[]()*+-.\_`{}</p>
> ```
'&excl;&num;&lbrack;&rbrack;()&ast;&plus;&#45;&period;&bsol;&lowbar;&grave;&lbrace;&rbrace;'
## should allow to encode additional symbols (single code point)
> ```html
> <p>!#[]()*+-.\_`{}</p><p>👁️ - eye</p><p>👁️‍🗨️ - eye in a speech bubble</p><p>😀 - smiley</p>
> ```
`&excl;&num;&lbrack;&rbrack;&lpar;&rpar;&ast;&plus;&#45;&period;&bsol;&lowbar;&grave;&lbrace;&rbrace;␊
␊
:eye:️ &#45; eye␊
␊
:eye:️‍🗨️ &#45; eye in a speech bubble␊
␊
:smiley: &#45; smiley`
Binary file modified packages/html-to-md/test/snapshots/html-to-md.js.snap
Binary file not shown.

0 comments on commit 06c4272

Please sign in to comment.