Skip to content

Commit

Permalink
Use LXML for html_to_vdom (#795)
Browse files Browse the repository at this point in the history
Co-authored-by: Ryan Morshead <[email protected]>
  • Loading branch information
Archmonger and rmorshea authored Aug 14, 2022
1 parent 2f0bb98 commit b06977a
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 94 deletions.
2 changes: 2 additions & 0 deletions docs/source/about/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Unreleased

**Fixed**

- :issue:`777` - Fix edge cases where ``html_to_vdom`` can fail to convert HTML
- :issue:`789` - Conditionally rendered components cannot use contexts
- :issue:`773` - Use strict equality check for text, numeric, and binary types in hooks
- :issue:`801` - Accidental mutation of old model causes invalid JSON Patch
Expand All @@ -38,6 +39,7 @@ Unreleased
**Added**

- :pull:`123` - ``asgiref`` as a dependency
- :pull:`795` - ``lxml`` as a dependency


v0.39.0
Expand Down
1 change: 1 addition & 0 deletions requirements/pkg-deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ fastjsonschema >=2.14.5
requests >=2
colorlog >=6
asgiref >=3
lxml >= 4
2 changes: 1 addition & 1 deletion src/idom/backend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def run(
implementation: BackendImplementation[Any] | None = None,
) -> None:
"""Run a component with a development server"""
logger.warn(
logger.warning(
"You are running a development server. "
"Change this before deploying in production!"
)
Expand Down
242 changes: 160 additions & 82 deletions src/idom/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
from html.parser import HTMLParser as _HTMLParser
from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar
from itertools import chain
from typing import Any, Callable, Generic, Iterable, List, TypeVar, Union

from lxml import etree
from lxml.html import fragments_fromstring

import idom
from idom.core.types import VdomDict


_RefValue = TypeVar("_RefValue")
_ModelTransform = Callable[[VdomDict], Any]
_UNDEFINED: Any = object()


Expand Down Expand Up @@ -49,11 +56,9 @@ def __repr__(self) -> str:
return f"{type(self).__name__}({current})"


_ModelTransform = Callable[[Dict[str, Any]], Any]


def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
"""Transform HTML into a DOM model
def html_to_vdom(html: str, *transforms: _ModelTransform, strict: bool = True) -> VdomDict:
"""Transform HTML into a DOM model. Unique keys can be provided to HTML elements
using a ``key=...`` attribute within your HTML tag.
Parameters:
html:
Expand All @@ -62,81 +67,154 @@ def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
dictionary which will be replaced by ``new``. For example, you could use a
transform function to add highlighting to a ``<code/>`` block.
strict:
If ``True``, raise an exception if the HTML does not perfectly follow HTML5
syntax.
"""
parser = HtmlParser()
parser.feed(source)
root = parser.model()
to_visit = [root]
while to_visit:
node = to_visit.pop(0)
if isinstance(node, dict) and "children" in node:
transformed = []
for child in node["children"]:
if isinstance(child, dict):
for t in transforms:
child = t(child)
if child is not None:
transformed.append(child)
to_visit.append(child)
node["children"] = transformed
if "attributes" in node and not node["attributes"]:
del node["attributes"]
if "children" in node and not node["children"]:
del node["children"]
return root


class HtmlParser(_HTMLParser):
"""HTML to VDOM parser
Example:
.. code-block::
parser = HtmlParser()
parser.feed(an_html_string)
parser.feed(another_html_string)
...
vdom = parser.model()
if not isinstance(html, str): # pragma: no cover
raise TypeError(f"Expected html to be a string, not {type(html).__name__}")

# If the user provided a string, convert it to a list of lxml.etree nodes
parser = etree.HTMLParser(
remove_comments=True,
remove_pis=True,
remove_blank_text=True,
recover=not strict,
)
try:
nodes: List = fragments_fromstring(html, no_leading_text=True, parser=parser)
except etree.XMLSyntaxError as e:
if not strict:
raise e # pragma: no cover
raise HTMLParseError(
"An error has occurred while parsing the HTML.\n\n"
"This HTML may be malformatted, or may not perfectly adhere to HTML5.\n"
"If you believe the exception above was due to something intentional, "
"you can disable the strict parameter on html_to_vdom().\n"
"Otherwise, repair your broken HTML and try again."
) from e
has_root_node = len(nodes) == 1

# Find or create a root node
if has_root_node:
root_node = nodes[0]
else:
# etree.Element requires a non-empty tag - we correct this below
root_node = etree.Element("TEMP", None, None)
for child in nodes:
root_node.append(child)

# Convert the lxml node to a VDOM dict
vdom = _etree_to_vdom(root_node, transforms)

# Change the artificially created root node to a React Fragment, instead of a div
if not has_root_node:
vdom["tagName"] = ""

return vdom


def _etree_to_vdom(
node: etree._Element, transforms: Iterable[_ModelTransform]
) -> VdomDict:
"""Recusively transform an lxml etree node into a DOM model
Parameters:
node:
The ``lxml.etree._Element`` node
transforms:
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
dictionary which will be replaced by ``new``. For example, you could use a
transform function to add highlighting to a ``<code/>`` block.
"""
if not isinstance(node, etree._Element): # pragma: no cover
raise TypeError(
f"Expected node to be a etree._Element, not {type(node).__name__}"
)

def model(self) -> Dict[str, Any]:
"""Get the current state of parsed VDOM model"""
return self._node_stack[0]

def feed(self, data: str) -> None:
"""Feed in HTML that will update the :meth:`HtmlParser.model`"""
self._node_stack.append(self._make_vdom("div", {}))
super().feed(data)

def reset(self) -> None:
"""Reset the state of the parser"""
self._node_stack: List[Dict[str, Any]] = []
super().reset()

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
new = self._make_vdom(tag, dict(attrs))
current = self._node_stack[-1]
current["children"].append(new)
self._node_stack.append(new)

def handle_endtag(self, tag: str) -> None:
del self._node_stack[-1]

def handle_data(self, data: str) -> None:
self._node_stack[-1]["children"].append(data)

@staticmethod
def _make_vdom(tag: str, attrs: Dict[str, Any]) -> Dict[str, Any]:
if "style" in attrs:
style = attrs["style"]
if isinstance(style, str):
style_dict = {}
for k, v in (part.split(":", 1) for part in style.split(";") if part):
title_case_key = k.title().replace("-", "")
camel_case_key = title_case_key[:1].lower() + title_case_key[1:]
style_dict[camel_case_key] = v
attrs["style"] = style_dict
return {"tagName": tag, "attributes": attrs, "children": []}
# This will recursively call _etree_to_vdom() on all children
children = _generate_vdom_children(node, transforms)

# Convert the lxml node to a VDOM dict
attributes = dict(node.items())
key = attributes.pop("key", None)

if hasattr(idom.html, node.tag):
vdom = getattr(idom.html, node.tag)(attributes, *children, key=key)
else:
vdom: VdomDict = {"tagName": node.tag}
if children:
vdom["children"] = children
if attributes:
vdom["attributes"] = attributes
if key is not None:
vdom["key"] = key

# Perform any necessary mutations on the VDOM attributes to meet VDOM spec
_mutate_vdom(vdom)

# Apply any provided transforms.
for transform in transforms:
vdom = transform(vdom)

return vdom


def _mutate_vdom(vdom: VdomDict):
    """Adjust a VDOM dict in place so its attributes meet the VDOM spec.

    The only adjustment made at present is turning a string ``style`` attribute
    (``"background-color: red; margin: 0"``) into a dictionary with camelCase
    keys (``{"backgroundColor": "red", "margin": "0"}``) so that React can
    render it. Nothing is returned; ``vdom`` is modified directly.

    This function may be extended in the future.
    """
    # Nothing to do unless there is a string-valued style attribute
    if "attributes" not in vdom:
        return
    if "style" not in vdom["attributes"]:
        return
    raw_style = vdom["attributes"]["style"]
    if not isinstance(raw_style, str):
        return

    # Convince type checker that it's safe to mutate attributes
    assert isinstance(vdom["attributes"], dict)

    # Build the style dict one declaration at a time. Segments without a colon
    # (empty pieces around stray semicolons, for example) are ignored.
    style_dict = {}
    for declaration in raw_style.split(";"):
        if ":" not in declaration:
            continue
        # Split on the first colon only so values may themselves contain colons
        name, value = declaration.split(":", 1)
        camel_case_name = _hypen_to_camel_case(name.strip())
        style_dict[camel_case_name] = value.strip()

    vdom["attributes"]["style"] = style_dict


def _generate_vdom_children(
    node: etree._Element, transforms: Iterable[_ModelTransform]
) -> List[Union[VdomDict, str]]:
    """Build the list of VDOM children for an lxml node.

    The result interleaves text and elements in document order: the node's own
    leading text (if any) comes first, then each child element converted via
    ``_etree_to_vdom``, each immediately followed by its tail text (if any).
    """
    vdom_children: List[Union[VdomDict, str]] = []

    # Text that appears before the first child element
    if node.text:
        vdom_children.append(node.text)

    for child in node.iterchildren(None):
        # Recursively convert the child element itself
        vdom_children.append(_etree_to_vdom(child, transforms))
        # Text that follows the child's closing tag belongs to this node
        if child.tail:
            vdom_children.append(child.tail)

    return vdom_children


def _hypen_to_camel_case(string: str) -> str:
"""Convert a hypenated string to camelCase."""
first, _, remainder = string.partition("-")
return first.lower() + remainder.title().replace("-", "")


class HTMLParseError(etree.LxmlSyntaxError):
    """Raised when an HTML document cannot be parsed using strict parsing.

    ``html_to_vdom`` raises this, chained from the underlying
    ``etree.XMLSyntaxError``, when lxml rejects the input and ``strict`` is
    ``True``. It subclasses ``etree.LxmlSyntaxError``, so handlers written for
    lxml syntax errors also catch it.
    """
2 changes: 1 addition & 1 deletion src/idom/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def use_linked_inputs(
value, set_value = idom.hooks.use_state(initial_value)

def sync_inputs(event: Dict[str, Any]) -> None:
new_value = event["value"]
new_value = event["target"]["value"]
set_value(new_value)
if not new_value and ignore_empty:
return None
Expand Down
Loading

0 comments on commit b06977a

Please sign in to comment.