Skip to content

Commit

Permalink
Fix microformats#95: Don't add slashes to void elements
Browse files Browse the repository at this point in the history
Beautifulsoup 4.6.2 introduced the ability to control the slashes in void elements through formatters.
This adds a formatter that does this, but otherwise only does minimal encoding (as the previous, default, formatter did)
  • Loading branch information
sknebel committed Oct 2, 2018
1 parent 2e84386 commit 0e5d20a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
7 changes: 6 additions & 1 deletion mf2py/dom_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import bs4
import re

from bs4.element import Tag, NavigableString, Comment
from bs4.element import Tag, NavigableString, Comment, MinimalHTMLFormatter

if sys.version < '3':
from urlparse import urljoin
Expand All @@ -21,6 +21,11 @@
_reduce_spaces_regex = re.compile(r" {2,}")


class MinimalHTML5Formatter(MinimalHTMLFormatter):
    """An HTML formatter that omits the slash in void tags and otherwise does minimal replacement"""
    # NOTE(review): None presumably tells bs4 to emit no "/" before the closing
    # ">" of void elements (e.g. <br> instead of <br/>) -- confirm against the
    # bs4 formatter implementation for the installed version.
    void_element_close_prefix = None


def try_urljoin(base, url, allow_fragments=True):
"""attempts urljoin, on ValueError passes through url. Shortcuts http(s):// urls"""
if url.startswith(("https://", "http://")):
Expand Down
4 changes: 2 additions & 2 deletions mf2py/parse_property.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""functions to parse the properties of elements"""
from __future__ import unicode_literals, print_function

from .dom_helpers import get_attr, get_img_src_alt, get_textContent, try_urljoin
from .dom_helpers import get_attr, get_img_src_alt, get_textContent, try_urljoin, MinimalHTML5Formatter
from .datetime_helpers import normalize_datetime, DATETIME_RE, TIME_RE
from . import value_class_pattern

Expand Down Expand Up @@ -104,6 +104,6 @@ def datetime(el, default_date=None):
def embedded(el, base_url=''):
    """Process e-* properties.

    Returns a dict with two entries: 'html', the element's serialized
    contents with surrounding whitespace stripped, and 'value', the result
    of get_textContent(el, replace_img=True, base_url=base_url).
    """
    return {
        # NOTE(review): the two 'html' entries below look like the removed and
        # added lines of the diff shown together; in a dict literal the later
        # entry is the one that takes effect -- confirm which is intended.
        'html': el.decode_contents().strip(), # secret bs4 method to get innerHTML
        'html': el.decode_contents(formatter=MinimalHTML5Formatter()).strip(), # secret bs4 method to get innerHTML
        'value': get_textContent(el, replace_img=True, base_url=base_url)
    }

0 comments on commit 0e5d20a

Please sign in to comment.