Source code for myst_parser.parsers.parse_html

"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.

The AST can also reproduce the HTML text.

Example::

    >> text = '<div class="note"><p>text</p></div>'
    >> ast = tokenize_html(text)
    >> list(ast.walk(include_self=True))
    [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')]
    >> str(ast)
    '<div class="note"><p>text</p></div>'
    >> str(ast[0][0])
    '<p>text</p>'

Note: optional tags are not accounted for
(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)

"""

from __future__ import annotations

import inspect
import itertools
from collections import abc, deque
from collections.abc import Callable, Iterable, Iterator
from html.parser import HTMLParser
from typing import Any


class Attribute(dict):
    """Hold a tag's attributes, keyed by attribute name.

    Values are normally strings.  ``html.parser`` reports an attribute that
    was written without a value (e.g. ``<input disabled>``) with the value
    ``None``; such entries are supported by :attr:`classes` and ``__str__``.
    """

    def __getitem__(self, key: str) -> str:
        """Return the value stored for ``key``, or ``""`` if the key is absent."""
        return self.get(key, "")

    @property
    def classes(self) -> list[str]:
        """Return the whitespace-separated ``class`` attribute as a list.

        A missing or value-less ``class`` attribute yields an empty list.
        """
        # ``or ""`` guards against a value-less ``class`` attribute (None),
        # which would otherwise fail on ``.split()``.
        return (self["class"] or "").split()

    def __str__(self) -> str:
        """Return the attributes as they appear inside an HTML start tag.

        Items are emitted in insertion order, separated by single spaces.
        A ``None`` value is rendered as a bare attribute name; any other value
        is rendered as ``key="value"`` with double quotes inside the value
        written as ``&quot;`` so the quoted value cannot be terminated early.
        """
        rendered = []
        for key, value in self.items():
            if value is None:
                # value-less attribute: emit the bare name, not key="None"
                rendered.append(f"{key}")
            else:
                escaped = str(value).replace('"', "&quot;")
                rendered.append(f'{key}="{escaped}"')
        return " ".join(rendered)
class Element(abc.MutableSequence):
    """An Element of the xml/html document.

    All xml/html entities inherit from this class.

    An element has a ``name``, an ``attrs`` mapping and an ordered list of
    child elements, which is exposed through the ``MutableSequence``
    interface (indexing, ``len``, iteration, ``insert``/``append``, ...).
    Elements compare (and hash) by identity.
    """

    def __init__(self, name: str = "", attr: dict | None = None) -> None:
        """Initialise the element.

        :param name: the element (tag) name
        :param attr: initial attributes; copied into a new ``Attribute``
        """
        self.name = name
        self.attrs: Attribute = Attribute(attr or {})
        self._parent: Element | None = None
        self._children: list[Element] = []

    @property
    def parent(self) -> Element | None:
        """Return parent."""
        return self._parent

    @property
    def children(self) -> list[Element]:
        """Return copy of children."""
        return self._children[:]

    def reset_children(self, children: list[Element], deepcopy: bool = False):
        """Replace the list of children with ``children``.

        :param children: the new child elements
        :param deepcopy: if True, store a ``deepcopy()`` of every item
            instead of the item itself
        :raises AssertionError: if an item is not an ``Element``, or already
            has a parent other than this element
        """
        new_children = []
        for i, item in enumerate(children):
            assert isinstance(item, Element)
            if deepcopy:
                item = item.deepcopy()
            if item._parent is None:
                item._parent = self
            elif item._parent != self:
                raise AssertionError(f"different parent already set for item {i}")
            new_children.append(item)
        self._children = new_children

    def __getitem__(self, index: int) -> Element:  # type: ignore[override]
        return self._children[index]

    def __setitem__(self, index: int, item: Element):  # type: ignore[override]
        assert isinstance(item, Element)
        if item._parent is not None and item._parent != self:
            raise AssertionError(f"different parent already set for: {item!r}")
        item._parent = self
        return self._children.__setitem__(index, item)

    def __delitem__(self, index: int):  # type: ignore[override]
        return self._children.__delitem__(index)

    def __len__(self) -> int:
        return self._children.__len__()

    def __iter__(self) -> Iterator[Element]:
        yield from self._children

    def insert(self, index: int, item: Element):
        """Insert ``item`` before ``index`` and make this element its parent.

        :raises AssertionError: if ``item`` is not an ``Element``, or already
            has a parent other than this element
        """
        assert isinstance(item, Element)
        if item._parent is not None and item._parent != self:
            raise AssertionError(f"different parent already set for: {item!r}")
        item._parent = self
        return self._children.insert(index, item)

    def deepcopy(self) -> Element:
        """Recursively copy and remove parent."""
        _copy = self.__class__(self.name, self.attrs)
        for child in self:
            _copy_child = child.deepcopy()
            _copy.append(_copy_child)
        return _copy

    def __repr__(self) -> str:
        text = f"{self.__class__.__name__}({self.name!r}"
        if self.attrs:
            text += f", {self.attrs!r}"
        text += ")"
        return text

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Returns a HTML string representation of the element.

        :param tag_overrides: Provide a dictionary of render function
            for specific tag names, to override the normal render format

        :raises NotImplementedError: always; subclasses provide the rendering
        """
        raise NotImplementedError

    def __str__(self) -> str:
        return self.render()

    def __eq__(self, item: Any) -> bool:
        # Elements are equal only to themselves (identity comparison).
        return item is self

    def __hash__(self) -> int:
        # A class that defines __eq__ without __hash__ becomes unhashable.
        # Equality is identity here, so the default identity hash is the
        # consistent choice and lets elements be used in sets / as dict keys.
        return object.__hash__(self)

    def walk(self, include_self: bool = False) -> Iterator[Element]:
        """Walk through the xml/html AST.

        Yields every descendant depth-first, each element before its own
        children; ``include_self`` additionally yields this element first.
        """
        if include_self:
            yield self
        for child in self:
            yield child
            yield from child.walk()

    def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
        """Return copy with all `Data` tokens
        that only contain whitespace / newlines removed.

        :param inplace: modify and return this element instead of a deep copy
        :param recurse: also strip all descendants, not just direct children
        """
        element = self
        if not inplace:
            element = self.deepcopy()
        element.reset_children(
            [
                e
                for e in element.children
                if not (isinstance(e, Data) and e.data.strip() == "")
            ]
        )
        if recurse:
            for child in element:
                # ``element`` is already the object being returned (a copy
                # unless inplace was requested), so descendants are edited
                # in place.
                child.strip(inplace=True, recurse=True)
        return element

    def find(
        self,
        identifier: str | type[Element],
        attrs: dict | None = None,
        classes: Iterable[str] | None = None,
        include_self: bool = False,
        recurse: bool = True,
    ) -> Iterator[Element]:
        """Find all elements that match name and specific attributes.

        :param identifier: an element name to compare with ``name``, or a
            class to test with ``isinstance``
        :param attrs: attribute values that must all be equal on a match
        :param classes: class names that must all be present on a match
        :param include_self: also test this element itself
        :param recurse: search all descendants rather than direct children only
        """
        iterator = self.walk() if recurse else self
        if include_self:
            iterator = itertools.chain([self], iterator)
        test_func = (
            (lambda c: isinstance(c, identifier))
            if inspect.isclass(identifier)
            else lambda c: c.name == identifier
        )
        classes = set(classes) if classes is not None else classes
        for child in iterator:
            if test_func(child):
                if classes is not None and not classes.issubset(child.attrs.classes):
                    continue
                for key, value in (attrs or {}).items():
                    if child.attrs[key] != value:
                        break
                else:
                    # no attribute mismatch (or no attrs requested)
                    yield child
class Root(Element):
    """The root of the AST tree."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the HTML of the whole tree.

        Every direct child is rendered with the given keyword arguments and
        the results are concatenated in order.
        """
        rendered_children = [node.render(**kwargs) for node in self]
        return "".join(rendered_children)
class Tag(Element):
    """Represent xml/html tags under the form: <name key="value" ...> ... </name>."""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the start tag, the rendered children and the end tag.

        If ``tag_overrides`` has an entry for this tag's name, that function
        is called with this element and the overrides mapping, and its result
        is returned instead.
        """
        if tag_overrides and self.name in tag_overrides:
            override = tag_overrides[self.name]
            return override(self, tag_overrides)

        # a separating space is only needed when there are attributes
        gap = " " if self.attrs else ""
        opening = f"<{self.name}{gap}{self.attrs}>"
        inner = "".join(
            node.render(tag_overrides=tag_overrides, **kwargs) for node in self
        )
        closing = f"</{self.name}>"
        return opening + inner + closing
class XTag(Element):
    """Represent XHTML style tags with no children, like `<img src="t.gif" />`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the self-closing tag ``<name attrs/>``.

        If ``tag_overrides`` is given and has an entry for this tag's name,
        that function is called with this element and the overrides mapping,
        and its result is returned instead.
        """
        if tag_overrides is None or self.name not in tag_overrides:
            gap = " " if self.attrs else ""
            return f"<{self.name}{gap}{self.attrs}/>"
        return tag_overrides[self.name](self, tag_overrides)
class VoidTag(Element):
    """Represent tags with no children, only start tag, like `<img src="t.gif" >`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the start tag ``<name attrs>``; no end tag is emitted.

        :param tag_overrides: optional mapping of tag name to render function.
            If it has an entry for this tag's name, that function is called
            with this element and the mapping, and its result is returned
            instead of the normal format.
        """
        # Honour overrides here too, so that an override registered for e.g.
        # "img" applies to `<img>` as well as to `<img/>`.
        if tag_overrides and self.name in tag_overrides:
            return tag_overrides[self.name](self, tag_overrides)
        return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
class TerminalElement(Element):
    """An element that carries a ``data`` string.

    It is initialised with an empty name and no attributes.
    """

    def __init__(self, data: str):
        """Store ``data`` as the element's text payload."""
        super().__init__("")
        self.data: str = data

    def __repr__(self) -> str:
        """Return ``ClassName('data')``, shortening data over 20 characters."""
        preview = self.data
        if len(preview) > 20:
            # keep the first 17 characters and mark the cut
            preview = preview[:17] + "..."
        return f"{type(self).__name__}({preview!r})"

    def deepcopy(self) -> TerminalElement:
        """Copy and remove parent."""
        return type(self)(self.data)
class Data(TerminalElement):
    """Represent data inside xml/html documents, like raw text."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the stored text unchanged; keyword arguments are ignored."""
        raw_text = self.data
        return raw_text
class Declaration(TerminalElement):
    """Represent declarations, like `<!DOCTYPE html>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped as ``<!...>``; keyword arguments are ignored."""
        return "<!{}>".format(self.data)
class Comment(TerminalElement):
    """Represent HTML comments"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped as ``<!--...-->``; keyword arguments are ignored."""
        return "<!--{}-->".format(self.data)
class Pi(TerminalElement):
    """Represent processing instructions like `<?xml-stylesheet ?>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped as ``<?...>``; keyword arguments are ignored."""
        return "<?{}>".format(self.data)
class Char(TerminalElement):
    """Represent character codes like: `&#0`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped as ``&#...;``; keyword arguments are ignored."""
        return "&#{};".format(self.data)
class Entity(TerminalElement):
    """Represent entities like `&amp`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped as ``&...;``; keyword arguments are ignored."""
        return "&{};".format(self.data)
class Tree:
    """The engine class to generate the AST tree.

    ``outmost`` is the ``Root`` of the tree being built.  ``stack`` holds the
    chain of currently open elements, from the root (bottom) to the innermost
    open tag (top); new nodes are appended to the element on top.
    """

    def __init__(self, name: str = ""):
        """Initialise Tree

        :param name: the name given to the ``Root`` element
        """
        self.name = name
        self.outmost = Root(name)
        self.stack: deque = deque()
        self.stack.append(self.outmost)

    def clear(self):
        """Clear the outmost and stack for a new parsing."""
        self.outmost = Root(self.name)
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self) -> Element:
        """Return the last pointer which point to the actual tag scope."""
        return self.stack[-1]

    def nest_tag(self, name: str, attrs: dict):
        """Nest a given tag at the bottom of the tree using
        the last stack's pointer.

        The new ``Tag`` becomes the current scope, until it is closed.
        """
        item = Tag(name, attrs)
        self.last().append(item)
        self.stack.append(item)

    def nest_xtag(self, name: str, attrs: dict):
        """Nest an XTag onto the tree."""
        self.last().append(XTag(name, attrs))

    def nest_vtag(self, name: str, attrs: dict):
        """Nest a VoidTag onto the tree."""
        self.last().append(VoidTag(name, attrs))

    def nest_terminal(self, klass: type[TerminalElement], data: str):
        """Nest the data onto the tree.

        :param klass: the ``TerminalElement`` subclass to instantiate
        :param data: the text passed to ``klass``
        """
        self.last().append(klass(data))

    def enclose(self, name: str):
        """When a closing tag is found, pop the pointer's scope from the stack,
        to then point to the earlier scope's tag.

        The innermost open element named ``name`` is closed together with
        every element opened inside it.  If no open element has that name,
        the stack is left untouched.  The root is never popped, even when its
        name equals ``name``, so ``last()`` always has an element to return.
        """
        to_pop = 0
        for depth, node in enumerate(reversed(self.stack), start=1):
            if node is self.outmost:
                # reached the root without finding a matching open tag
                break
            if node.name == name:
                to_pop = depth
                break
        # It pops all the items which do not match with the closing tag,
        # plus the matching one.
        for _ in range(to_pop):
            self.stack.pop()
class HtmlToAst(HTMLParser):
    """The tokenizer class.

    The ``HTMLParser`` callbacks are forwarded to a ``Tree``, which builds
    the AST returned by :meth:`feed`.
    """

    # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
    void_elements = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }

    def __init__(self, name: str = "", convert_charrefs: bool = False):
        """Initialise the parser.

        :param name: the name given to the root of the generated tree
        :param convert_charrefs: passed on to ``HTMLParser``
        """
        super().__init__(convert_charrefs=convert_charrefs)
        self.struct = Tree(name)

    def feed(self, source: str) -> Root:  # type: ignore[override]
        """Parse the source string.

        Each call parses ``source`` as a complete document and returns a new
        root; nothing is carried over from a previous call.
        """
        # Drop anything still buffered by a previous call, so it is not
        # prepended to ``source``.
        self.reset()
        self.struct.clear()
        super().feed(source)
        # Flush text that HTMLParser is holding back while waiting for more
        # input (e.g. a trailing "&T" or "<"), so it is not silently lost.
        self.close()
        return self.struct.outmost

    def handle_starttag(self, name: str, attr):
        """When found an opening tag then nest it onto the tree."""
        if name in self.void_elements:
            self.struct.nest_vtag(name, attr)
        else:
            self.struct.nest_tag(name, attr)

    def handle_startendtag(self, name: str, attr):
        """When found a XHTML tag style then nest it up to the tree."""
        self.struct.nest_xtag(name, attr)

    def handle_endtag(self, name: str):
        """When found a closing tag then makes it point to the right scope."""
        # void elements never open a scope, so there is nothing to close
        if name not in self.void_elements:
            self.struct.enclose(name)

    def handle_data(self, data: str):
        """Nest data onto the tree."""
        self.struct.nest_terminal(Data, data)

    def handle_decl(self, decl: str):
        """Nest a declaration onto the tree."""
        self.struct.nest_terminal(Declaration, decl)

    def unknown_decl(self, decl: str):
        """Nest an unrecognised declaration onto the tree as a ``Declaration``."""
        self.struct.nest_terminal(Declaration, decl)

    def handle_charref(self, data: str):
        """Nest a character reference onto the tree."""
        self.struct.nest_terminal(Char, data)

    def handle_entityref(self, data: str):
        """Nest an entity reference onto the tree."""
        self.struct.nest_terminal(Entity, data)

    def handle_pi(self, data: str):
        """Nest a processing instruction onto the tree."""
        self.struct.nest_terminal(Pi, data)

    def handle_comment(self, data: str):
        """Nest a comment onto the tree."""
        self.struct.nest_terminal(Comment, data)
def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
    """Parse ``text`` with a new ``HtmlToAst`` parser and return the root.

    :param text: the HTML source to parse
    :param name: the name given to the root element
    :param convert_charrefs: passed on to the parser
    """
    return HtmlToAst(name, convert_charrefs=convert_charrefs).feed(text)