Source code for myst_parser.mdit_to_docutils.html_to_nodes

"""Convert HTML to docutils nodes."""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

from docutils import nodes

from myst_parser.parsers.parse_html import Data, tokenize_html
from myst_parser.warnings_ import MystWarnings

if TYPE_CHECKING:
    from .base import DocutilsRenderer



[docs]
def make_error(
    document: nodes.document, error_msg: str, text: str, line_number: int
) -> nodes.system_message:
    return document.reporter.error(
        error_msg,
        nodes.literal_block(text, text),
        line=line_number,
    )



OPTION_KEYS_IMAGE = {"class", "alt", "height", "width", "align", "name"}
# note: docutils also has scale and target

OPTION_KEYS_ADMONITION = {"class", "name"}

# See https://github.com/micromark/micromark-extension-gfm-tagfilter
RE_FLOW = re.compile(
    r"<(\/?)(iframe|noembed|noframes|plaintext|script|style|title|textarea|xmp)(?=[\t\n\f\r />])",
    re.IGNORECASE,
)



[docs]
def default_html(text: str, source: str, line_number: int) -> list[nodes.Element]:
    raw_html = nodes.raw("", text, format="html")
    raw_html.source = source
    raw_html.line = line_number
    return [raw_html]




[docs]
def html_to_nodes(
    text: str, line_number: int, renderer: DocutilsRenderer
) -> list[nodes.Element]:
    """Convert HTML to docutils nodes."""
    if renderer.md_config.gfm_only:
        text, _ = RE_FLOW.subn(lambda s: s.group(0).replace("<", "&lt;"), text)

    enable_html_img = "html_image" in renderer.md_config.enable_extensions
    enable_html_admonition = "html_admonition" in renderer.md_config.enable_extensions
    if not (enable_html_img or enable_html_admonition):
        return default_html(text, renderer.document["source"], line_number)

    # parse the HTML to AST
    try:
        root = tokenize_html(text).strip(inplace=True, recurse=False)
    except Exception:
        msg_node = renderer.create_warning(
            "HTML could not be parsed", MystWarnings.HTML_PARSE, line=line_number
        )
        return ([msg_node] if msg_node else []) + default_html(
            text, renderer.document["source"], line_number
        )

    if len(root) < 1:
        # if empty
        return default_html(text, renderer.document["source"], line_number)

    if not all(
        (enable_html_img and child.name == "img")
        or (
            enable_html_admonition
            and child.name == "div"
            and "admonition" in child.attrs.classes
        )
        for child in root
    ):
        return default_html(text, renderer.document["source"], line_number)

    nodes_list = []
    for child in root:
        if child.name == "img":
            if "src" not in child.attrs:
                return [
                    renderer.reporter.error(
                        "<img> missing 'src' attribute", line=line_number
                    )
                ]
            content = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_IMAGE
            )
            nodes_list.extend(
                renderer.run_directive(
                    "image", child.attrs["src"], content, line_number
                )
            )

        else:
            children = child.strip().children
            title = (
                "".join(child.render() for child in children.pop(0))
                if children
                and children[0].name in ("div", "p")
                and (
                    "title" in children[0].attrs.classes
                    or "admonition-title" in children[0].attrs.classes
                )
                else "Note"
            )

            options = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_ADMONITION
            ).rstrip()
            new_children = []
            for child in children:
                if child.name == "p":
                    new_children.extend(child.children)
                    new_children.append(Data("\n\n"))
                else:
                    new_children.append(child)
            content = (
                options
                + ("\n\n" if options else "")
                + "".join(child.render() for child in new_children).lstrip()
            )

            nodes_list.extend(
                renderer.run_directive("admonition", title, content, line_number)
            )

    return nodes_list