executablebooks · chrisjsewell · Jun 2, 2023 · Jun 2, 2023 · Jun 2, 2023
diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
@@ -2,7 +2,6 @@
 """
 from __future__ import annotations
 
-import html
 import re
 from typing import Match, TypeVar
 
@@ -52,9 +51,6 @@ def arrayReplaceAt(
  return src[:pos] + newElements + src[pos + 1 :]
 
 
-######################################################################
-
-
 def isValidEntityCode(c: int) -> bool:
  # broken sequence
  if c >= 0xD800 and c <= 0xDFFF:
@@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
  return chr(c)
 
 
-UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
+# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
 # ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
 UNESCAPE_ALL_RE = re.compile(
  r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
  re.IGNORECASE,
 )
-DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
+DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
+DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
 
 
 def replaceEntityPattern(match: str, name: str) -> str:
- """Convert HTML entity patterns
-
- ::
-
- https://www.google.com -> https%3A//www.google.com
-
+ """Convert HTML entity patterns,
+ see https://spec.commonmark.org/0.30/#entity-references
  """
- code = 0
-
  if name in entities:
  return entities[name]
 
- if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
- code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
- if isValidEntityCode(code):
- return fromCodePoint(code)
-
- return match
-
-
-# def replaceEntities(string):
-# if (string.indexOf('&') < 0):
-# return string
-# return string.replace(ENTITY_RE, replaceEntityPattern)
+ code: None | int = None
+ if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
+ code = int(pat.group(1), 10)
+ elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
+ code = int(pat.group(1), 16)
 
+ if code is not None and isValidEntityCode(code):
+ return fromCodePoint(code)
 
-def unescapeMd(string: str) -> str:
- raise NotImplementedError
- # if "\\" in string:
- # return string
- # return string.replace(UNESCAPE_MD_RE, "$1")
+ return match
 
 
 def unescapeAll(string: str) -> str:
@@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
  return ESCAPE_CHAR.sub(r"\1", string)
 
 
-# //////////////////////////////////////////////////////////////////////////////
-
-# TODO This section changed quite a lot, should re-check
-
-# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
-# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
-# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
-
-
-# def escapeHtml(string: str):
-
-# if HTML_ESCAPE_REPLACE_RE.search(string):
-
-# string = UNESCAPE_HTML_RE.sub("&", string)
-# string = ESCAPE_AND_HTML.sub("&amp;", string)
-# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
-# string = string.replace(k, v)
-
-# return string
-
-
 def escapeHtml(raw: str) -> str:
- # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
- return html.escape(raw).replace("&#x27;", "'")
+ """Replace special characters "&", "<", ">" and '"' with HTML-safe sequences."""
+ # like html.escape, but without escaping single quotes
+ raw = raw.replace("&", "&amp;") # Must be done first!
+ raw = raw.replace("<", "&lt;")
+ raw = raw.replace(">", "&gt;")
+ raw = raw.replace('"', "&quot;")
+ return raw
 
 
 # //////////////////////////////////////////////////////////////////////////////

diff --git a/tests/test_fuzzer.py b/tests/test_fuzzer.py
@@ -10,15 +10,15 @@
 from markdown_it import MarkdownIt
 
 TESTS = {
- 55363: ">```\n>",
- 55367: ">-\n>\n>",
- # 55371: "[](so&#4»0;!" TODO this did not fail
- # 55401: "?c_" * 100_000 TODO this did not fail
+ 55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
+ 55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
+ 55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
+ # 55401: (("?c_" * 100000) + "c_", ""), TODO this does not fail, just takes a long time
 }
 
 
-@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
-def test_fuzzing(raw_input):
+@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
+def test_fuzzing(raw_input, expected):
  md = MarkdownIt()
  md.parse(raw_input)
- print(md.render(raw_input))
+ assert md.render(raw_input) == expected
diff --git a/tests/test_port/fixtures/issue-fixes.md b/tests/test_port/fixtures/issue-fixes.md
@@ -45,3 +45,12 @@ Fix CVE-2023-26303
 <p><img src="%5B" alt="
 " /></p>
 .
+
+Fix parsing of incorrect numeric character references
+.
+[](&#X22y;) &#X22y;
+[](&#35y;) &#35y;
+.
+<p><a href="&amp;#X22y;"></a> &amp;#X22y;
+<a href="&amp;#35y;"></a> &amp;#35y;</p>
+.