Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 FIX: numeric character reference parsing #272

Merged
merged 2 commits into from
Jun 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 20 additions & 54 deletions markdown_it/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"""
from __future__ import annotations

import html
import re
from typing import Match, TypeVar

Expand Down Expand Up @@ -52,9 +51,6 @@ def arrayReplaceAt(
return src[:pos] + newElements + src[pos + 1 :]


######################################################################


def isValidEntityCode(c: int) -> bool:
# broken sequence
if c >= 0xD800 and c <= 0xDFFF:
Expand Down Expand Up @@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
return chr(c)


UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
re.IGNORECASE,
)
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)


def replaceEntityPattern(match: str, name: str) -> str:
"""Convert HTML entity patterns

::

https://www.google.com -> https%3A//www.google.com

"""Convert HTML entity patterns,
see https://spec.commonmark.org/0.30/#entity-references
"""
code = 0

if name in entities:
return entities[name]

if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
if isValidEntityCode(code):
return fromCodePoint(code)

return match


# def replaceEntities(string):
# if (string.indexOf('&') < 0):
# return string
# return string.replace(ENTITY_RE, replaceEntityPattern)
code: None | int = None
if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
code = int(pat.group(1), 10)
elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
code = int(pat.group(1), 16)

if code is not None and isValidEntityCode(code):
return fromCodePoint(code)

def unescapeMd(string: str) -> str:
raise NotImplementedError
# if "\\" in string:
# return string
# return string.replace(UNESCAPE_MD_RE, "$1")
return match


def unescapeAll(string: str) -> str:
Expand All @@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
return ESCAPE_CHAR.sub(r"\1", string)


# //////////////////////////////////////////////////////////////////////////////

# TODO This section changed quite a lot, should re-check

# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')


# def escapeHtml(string: str):

# if HTML_ESCAPE_REPLACE_RE.search(string):

# string = UNESCAPE_HTML_RE.sub("&", string)
# string = ESCAPE_AND_HTML.sub("&amp;", string)
# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
# string = string.replace(k, v)

# return string


def escapeHtml(raw: str) -> str:
# return html.escape(html.unescape(raw)).replace("&#x27;", "'")
return html.escape(raw).replace("&#x27;", "'")
"""Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
# like html.escape, but without escaping single quotes
raw = raw.replace("&", "&amp;") # Must be done first!
raw = raw.replace("<", "&lt;")
raw = raw.replace(">", "&gt;")
raw = raw.replace('"', "&quot;")
return raw


# //////////////////////////////////////////////////////////////////////////////
Expand Down
14 changes: 7 additions & 7 deletions tests/test_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@
from markdown_it import MarkdownIt

TESTS = {
55363: ">```\n>",
55367: ">-\n>\n>",
# 55371: "[](so&#4»0;!" TODO this did not fail
# 55401: "?c_" * 100_000 TODO this did not fail
55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
# 55401: (("?c_" * 100000) + "c_", ""), TODO this does not fail, just takes a long time
}


@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
def test_fuzzing(raw_input):
@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
def test_fuzzing(raw_input, expected):
md = MarkdownIt()
md.parse(raw_input)
print(md.render(raw_input))
assert md.render(raw_input) == expected
9 changes: 9 additions & 0 deletions tests/test_port/fixtures/issue-fixes.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,12 @@ Fix CVE-2023-26303
<p><img src="%5B" alt="
" /></p>
.

Fix parsing of incorrect numeric character references
.
[](&#X22y;) &#X22y;
[](&#35y;) &#35y;
.
<p><a href="&amp;#X22y;"></a> &amp;#X22y;
<a href="&amp;#35y;"></a> &amp;#35y;</p>
.