# Source code for myst_parser.inventory

"""Logic for dealing with sphinx style inventories (e.g. `objects.inv`).

These contain mappings of reference names to ids, scoped by domain and object type.

This is adapted from the Sphinx inventory.py module.
We replicate it here, so that it can be used without Sphinx.
"""

from __future__ import annotations

import argparse
import functools
import json
import re
import zlib
from collections.abc import Iterator
from dataclasses import asdict, dataclass
from typing import IO, TYPE_CHECKING, TypedDict
from urllib.request import urlopen

import yaml

if TYPE_CHECKING:
    # domain_type:object_type -> name -> (project, version, loc, text)
    # the `loc` includes the base url, also null `text` is denoted by "-"
    from sphinx.util.typing import Inventory as SphinxInventoryType



[docs]
class InventoryItemType(TypedDict):
    """A single inventory item.

    Items are the leaf values of the ``objects`` mapping of an inventory
    (domain -> object type -> name -> item).
    """

    # Where the referenced object lives.
    loc: str
    """The location of the item (relative if base_url not None)."""
    # ``None`` means "no explicit text"; the loaders in this module
    # store ``None`` when the raw text is empty or ``"-"``.
    text: str | None
    """Implicit text to show for the item."""




[docs]
class InventoryType(TypedDict):
    """Inventory data.

    This is the nested representation used throughout this module:
    project metadata plus the referenced objects, grouped first by domain
    and then by object type.
    """

    # Project name, as read from the inventory header.
    name: str
    """The name of the project."""
    # Project version, as read from the inventory header.
    version: str
    """The version of the project."""
    # ``None`` when no base URL is known for the item locations.
    base_url: str | None
    """The base URL of the `loc`."""
    # Three nested levels: objects[domain][object type][name] -> item.
    objects: dict[str, dict[str, dict[str, InventoryItemType]]]
    """Mapping of domain -> object type -> name -> item."""




[docs]
def from_sphinx(inv: SphinxInventoryType) -> InventoryType:
    """Convert from a Sphinx compliant format.

    Keys of ``inv`` that are not of the form ``"<domain>:<object type>"``
    are skipped.

    Each item may be either a ``(project, version, uri, text)`` tuple,
    or an object exposing ``project_name``, ``project_version``, ``uri`` and
    ``display_name`` attributes (the form that `filter_sphinx_inventories`
    treats as "Sphinx >= 8.2").

    :param inv: Mapping of ``domain:object_type`` -> name -> item
    :returns: The inventory data. ``base_url`` is always ``None``, and the
        project name / version are those of the last item read
        (empty strings if no item was read).
    """
    project = ""
    version = ""
    objs: dict[str, dict[str, dict[str, InventoryItemType]]] = {}
    for domain_obj_name, data in inv.items():
        if ":" not in domain_obj_name:
            continue

        domain_name, obj_type = domain_obj_name.split(":", 1)
        items = objs.setdefault(domain_name, {}).setdefault(obj_type, {})
        for refname, refdata in data.items():
            if hasattr(refdata, "project_name"):
                # attribute based item: read the named fields directly,
                # rather than relying on it being unpackable like a tuple
                project = refdata.project_name
                version = refdata.project_version
                uri = refdata.uri
                text = refdata.display_name
            else:
                project, version, uri, text = refdata
            items[refname] = {
                "loc": uri,
                # an empty text or "-" both denote "no explicit text"
                "text": None if (not text or text == "-") else text,
            }

    return {
        "name": project,
        "version": version,
        "base_url": None,
        "objects": objs,
    }




[docs]
def to_sphinx(inv: InventoryType) -> SphinxInventoryType:
    """Convert to a Sphinx compliant format.

    The nested ``domain -> object type -> name`` mapping is flattened to
    ``"domain:object_type" -> name -> (project, version, loc, text)``,
    where a missing (falsy) text is written as ``"-"``.
    Object types without any item produce no key in the result.
    """
    result: SphinxInventoryType = {}
    project_name = inv["name"]
    project_version = inv["version"]
    for domain, by_type in inv["objects"].items():
        for kind, references in by_type.items():
            qualified = f"{domain}:{kind}"
            for ref_name, item in references.items():
                entry = (
                    project_name,
                    project_version,
                    item["loc"],
                    item["text"] or "-",
                )
                result.setdefault(qualified, {})[ref_name] = entry  # type: ignore[assignment]
    return result




[docs]
def load(stream: IO, base_url: str | None = None) -> InventoryType:
    """Load inventory data from a stream.

    The first line of the stream selects the format specific loader.

    :param stream: The stream to read the inventory from
    :param base_url: Stored as the ``base_url`` of the returned inventory
    :raises ValueError: If the first line is not a known inventory header
    """
    reader = InventoryFileReader(stream)
    header = reader.readline().rstrip()
    loaders = {
        "# Sphinx inventory version 1": _load_v1,
        "# Sphinx inventory version 2": _load_v2,
    }
    loader = loaders.get(header)
    if loader is None:
        raise ValueError(f"invalid inventory header: {header}")
    return loader(reader, base_url)



def _load_v1(stream: InventoryFileReader, base_url: str | None) -> InventoryType:
    """Load inventory data (format v1) from a stream.

    The next two lines of the stream hold the project name and version,
    each after an 11 character prefix. Every following line is
    ``<name> <object type> <location>``; all objects are placed in the
    ``py`` domain.
    """
    project = stream.readline().rstrip()[11:]
    release = stream.readline().rstrip()[11:]
    objects: dict[str, dict[str, dict[str, InventoryItemType]]] = {}
    for raw_line in stream.readlines():
        name, kind, uri = raw_line.rstrip().split(None, 2)
        # version 1 did not add anchors to the location
        if kind == "mod":
            kind = "module"
            anchor = "#module-" + name
        else:
            anchor = "#" + name
        by_kind = objects.setdefault("py", {})
        by_kind.setdefault(kind, {})[name] = {"loc": uri + anchor, "text": None}

    return {
        "name": project,
        "version": release,
        "base_url": base_url,
        "objects": objects,
    }


def _load_v2(stream: InventoryFileReader, base_url: str | None) -> InventoryType:
    """Load inventory data (format v2) from a stream.

    The next two lines of the stream hold the project name and version
    (each after an 11 character prefix), followed by a line that must mention
    ``zlib``. The remainder is read as compressed lines of the form
    ``<name> <domain>:<object type> <priority> <location> <text>``.
    Lines that do not match this form, or whose type has no ``:``, are skipped.

    :param stream: The reader, positioned just after the version header line
    :param base_url: Stored as the ``base_url`` of the returned inventory
    :raises ValueError: If the compression line does not mention ``zlib``
    """
    projname = stream.readline().rstrip()[11:]
    version = stream.readline().rstrip()[11:]
    invdata: InventoryType = {
        "name": projname,
        "version": version,
        "base_url": base_url,
        "objects": {},
    }
    line = stream.readline()
    if "zlib" not in line:
        raise ValueError(f"invalid inventory header (not compressed): {line}")

    objects = invdata["objects"]
    # be careful to handle names with embedded spaces correctly
    entry_regex = re.compile(r"(?x)(.+?)\s+(\S+)\s+(-?\d+)\s+?(\S*)\s+(.*)")
    for line in stream.read_compressed_lines():
        m = entry_regex.match(line.rstrip())
        if not m:
            continue
        name, qualified_type, _, location, text = m.groups()
        if ":" not in qualified_type:
            # wrong type value. type should be in the form of "{domain}:{objtype}"
            #
            # Note: To avoid the regex DoS, this is implemented in python (refs: #8175)
            continue
        domain, objtype = qualified_type.split(":", 1)
        if qualified_type == "py:module" and name in objects.get(domain, {}).get(
            objtype, {}
        ):
            # due to a bug in 1.1 and below,
            # two inventory entries are created
            # for Python modules, and the first
            # one is correct
            #
            # (objects are nested by domain then object type,
            # so the lookup must use the split parts, not the "py:module" key)
            continue
        if location.endswith("$"):
            # a trailing "$" is shorthand for the name of the object
            location = location[:-1] + name
        if not text or text == "-":
            text = None
        objects.setdefault(domain, {}).setdefault(objtype, {})[name] = {
            "loc": location,
            "text": text,
        }
    return invdata


# Number of bytes requested from the underlying stream per read.
_BUFSIZE = 16 * 1024


class InventoryFileReader:
    """A file reader for an inventory file.

    This reader supports mixture of texts and compressed texts:
    plain lines can be read with `readline` / `readlines`,
    then the rest of the stream can be read as zlib compressed lines
    with `read_compressed_lines`.
    """

    def __init__(self, stream: IO) -> None:
        """Wrap a stream whose ``read`` returns bytes."""
        self.stream = stream
        # bytes read from the stream, but not yet consumed
        self.buffer = b""
        # set once a read from the stream returns no data
        self.eof = False

    def read_buffer(self) -> None:
        """Read the next chunk from the stream and append it to the buffer."""
        chunk = self.stream.read(_BUFSIZE)
        if not chunk:
            self.eof = True
        self.buffer += chunk

    def readline(self) -> str:
        """Return the next line, decoded and without its line feed.

        At the end of the stream, whatever remains in the buffer is returned
        (possibly the empty string).
        """
        while True:
            pos = self.buffer.find(b"\n")
            if pos != -1:
                line = self.buffer[:pos].decode()
                self.buffer = self.buffer[pos + 1 :]
                return line
            if self.eof:
                line = self.buffer.decode()
                self.buffer = b""
                return line
            # no complete line buffered yet, so fetch more data
            self.read_buffer()

    def readlines(self) -> Iterator[str]:
        """Yield the remaining non-empty lines, until the end of the stream."""
        while not self.eof:
            line = self.readline()
            if line:
                yield line

    def read_compressed_chunks(self) -> Iterator[bytes]:
        """Yield the rest of the stream, decompressed with zlib, chunk by chunk."""
        decompressor = zlib.decompressobj()
        while not self.eof:
            self.read_buffer()
            yield decompressor.decompress(self.buffer)
            self.buffer = b""
        yield decompressor.flush()

    def read_compressed_lines(self) -> Iterator[str]:
        """Yield the decoded lines of the decompressed rest of the stream.

        A final line that is not terminated by a line feed is also yielded.
        """
        pending = b""
        for chunk in self.read_compressed_chunks():
            # the last element is the (possibly empty) start of an incomplete line
            *complete, pending = (pending + chunk).split(b"\n")
            for raw_line in complete:
                yield raw_line.decode()
        if pending:
            yield pending.decode()




@functools.lru_cache(maxsize=256)
def _create_regex(pat: str) -> re.Pattern[str]:
    r"""Create a regex from a pattern, that can include `*` wildcards,
    to match 0 or more characters.

    `\*` is translated as a literal `*`.
    A backslash that is not followed by `*` (including one at the very end
    of the pattern) is matched literally. All other characters are also
    matched literally.

    :param pat: The wildcard pattern
    :returns: The compiled regex (results are cached per pattern)
    """
    parts: list[str] = []
    pending_backslash = False
    for char in pat:
        if pending_backslash:
            pending_backslash = False
            if char == "*":
                # `\*` is an escaped, i.e. literal, asterisk
                parts.append(re.escape(char))
                continue
            # the backslash did not escape anything, so it is a literal one
            parts.append(re.escape("\\"))
        if char == "\\":
            # defer, until it is known whether the next character is `*`
            pending_backslash = True
        elif char == "*":
            parts.append(".*")
        else:
            parts.append(re.escape(char))
    if pending_backslash:
        # a trailing backslash has nothing to escape: keep it as a literal
        parts.append(re.escape("\\"))

    return re.compile("".join(parts))



[docs]
def match_with_wildcard(name: str, pattern: str | None) -> bool:
    r"""Tell whether a whole name matches a pattern.

    The pattern can include `*` wildcards, that match 0 or more characters;
    use `\*` for a literal `*`. A pattern of ``None`` matches every name.
    """
    return pattern is None or _create_regex(pattern).fullmatch(name) is not None




[docs]
@dataclass
class InvMatch:
    """A match from an inventory.

    Instances are yielded by `filter_inventories` and
    `filter_sphinx_inventories`, one per matching inventory item.
    """

    # key of the inventory, in the mapping of inventories that was filtered
    inv: str
    # domain name of the item
    domain: str
    # object type of the item
    otype: str
    # reference name of the item
    name: str
    # name of the project the inventory belongs to
    project: str
    # version of the project the inventory belongs to
    version: str
    # base URL of the inventory, if any
    base_url: str | None
    # location of the item
    loc: str
    # text to show for the item, if any
    text: str | None

    def asdict(self) -> dict[str, str | None]:
        """Return the fields as a dictionary, keyed by field name."""
        return asdict(self)





[docs]
def filter_inventories(
    inventories: dict[str, InventoryType],
    *,
    invs: str | None = None,
    domains: str | None = None,
    otypes: str | None = None,
    targets: str | None = None,
) -> Iterator[InvMatch]:
    r"""Lazily yield the items of a set of inventories that pass all filters.

    Filters are strings that can include `*` wildcards, to match 0 or more characters.
    To include a literal `*` in the pattern, use `\*`.
    A filter of ``None`` matches everything.

    :param inventories: Mapping of inventory name to inventory data
    :param invs: the inventory key filter
    :param domains: the domain name filter
    :param otypes: the object type filter
    :param targets: the target name filter
    """
    for inv_key, inventory in inventories.items():
        if not match_with_wildcard(inv_key, invs):
            continue
        wanted_domains = (
            (key, value)
            for key, value in inventory["objects"].items()
            if match_with_wildcard(key, domains)
        )
        for domain_key, by_type in wanted_domains:
            wanted_types = (
                (key, value)
                for key, value in by_type.items()
                if match_with_wildcard(key, otypes)
            )
            for type_key, entries in wanted_types:
                for ref_name, entry in entries.items():
                    if not match_with_wildcard(ref_name, targets):
                        continue
                    yield InvMatch(
                        inv=inv_key,
                        domain=domain_key,
                        otype=type_key,
                        name=ref_name,
                        project=inventory["name"],
                        version=inventory["version"],
                        base_url=inventory["base_url"],
                        loc=entry["loc"],
                        text=entry["text"],
                    )




[docs]
def filter_sphinx_inventories(
    inventories: dict[str, SphinxInventoryType],
    *,
    invs: str | None = None,
    domains: str | None = None,
    otypes: str | None = None,
    targets: str | None = None,
) -> Iterator[InvMatch]:
    r"""Lazily yield the items of a set of sphinx style inventories that pass all filters.

    Filters are strings that can include `*` wildcards, to match 0 or more characters.
    To include a literal `*` in the pattern, use `\*`.
    A filter of ``None`` matches everything.

    Inventory keys that are not of the form ``"<domain>:<object type>"`` are skipped,
    and the ``base_url`` of every match is ``None``.

    :param inventories: Mapping of inventory name to inventory data
    :param invs: the inventory key filter
    :param domains: the domain name filter
    :param otypes: the object type filter
    :param targets: the target name filter
    """
    for inv_key, inventory in inventories.items():
        if not match_with_wildcard(inv_key, invs):
            continue
        for qualified_type, entries in inventory.items():
            domain_key, separator, type_key = qualified_type.partition(":")
            if not separator:
                continue
            if not match_with_wildcard(domain_key, domains) or not match_with_wildcard(
                type_key, otypes
            ):
                continue
            for ref_name, entry in entries.items():
                if not match_with_wildcard(ref_name, targets):
                    continue
                if hasattr(entry, "project_name"):
                    # Sphinx >= 8.2
                    project = entry.project_name
                    version = entry.project_version
                    loc = entry.uri
                    text = entry.display_name
                else:
                    project, version, loc, text = entry
                # an empty text or "-" both denote "no explicit text"
                shown_text = text if (text and text != "-") else None
                yield InvMatch(
                    inv=inv_key,
                    domain=domain_key,
                    otype=type_key,
                    name=ref_name,
                    project=project,
                    version=version,
                    base_url=None,
                    loc=loc,
                    text=shown_text,
                )




[docs]
def filter_string(
    invs: str | None,
    domains: str | None,
    otype: str | None,
    target: str | None,
    *,
    delimiter: str = ":",
) -> str:
    """Build a one line, human readable representation of a filter.

    The four parts are joined by the delimiter: a part that is ``None`` is shown as ``*``,
    and a part that contains the delimiter is wrapped in double quotes.
    """

    def _show(part: str | None) -> str:
        if part is None:
            return "*"
        return f'"{part}"' if delimiter in part else f"{part}"

    return delimiter.join(_show(part) for part in (invs, domains, otype, target))




[docs]
def fetch_inventory(
    uri: str, *, timeout: None | float = None, base_url: None | str = None
) -> InventoryType:
    """Fetch an inventory from a URL or local path.

    :param uri: An ``http://`` / ``https://`` URL, otherwise a path of a local file
    :param timeout: Passed on to the URL request (not used for local files)
    :param base_url: Stored as the ``base_url`` of the returned inventory
    """
    is_remote = uri.startswith(("http://", "https://"))
    source = urlopen(uri, timeout=timeout) if is_remote else open(uri, "rb")
    with source as stream:
        return load(stream, base_url=base_url)




[docs]
def inventory_cli(inputs: None | list[str] = None) -> None:
    """Command line interface for fetching and parsing an inventory.

    The inventory is read from a URL or a local path, filtered by
    domain / object type / name / location, then printed as YAML or JSON.

    For a URL, if reading it as an inventory fails for any reason,
    ``<URL>/objects.inv`` is tried instead, i.e. the URL is then assumed
    to be the base URL of a documentation site.

    :param inputs: The command line arguments to parse
        (``None`` leaves the choice of arguments to ``argparse``)
    """
    parser = argparse.ArgumentParser(description="Parse an inventory file.")
    parser.add_argument("uri", metavar="[URL|PATH]", help="URI of the inventory file")
    parser.add_argument(
        "-d",
        "--domain",
        metavar="DOMAIN",
        default="*",
        help="Filter the inventory by domain (`*` = wildcard)",
    )
    parser.add_argument(
        "-o",
        "--object-type",
        metavar="TYPE",
        default="*",
        help="Filter the inventory by object type (`*` = wildcard)",
    )
    parser.add_argument(
        "-n",
        "--name",
        metavar="NAME",
        default="*",
        help="Filter the inventory by reference name (`*` = wildcard)",
    )
    parser.add_argument(
        "-l",
        "--loc",
        metavar="LOC",
        help="Filter the inventory by reference location (`*` = wildcard)",
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=["yaml", "json"],
        default="yaml",
        help="Output format",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        metavar="SECONDS",
        help="Timeout for fetching the inventory",
    )
    args = parser.parse_args(inputs)

    base_url = None
    if args.uri.startswith(("http://", "https://")):
        try:
            with urlopen(args.uri, timeout=args.timeout) as stream:
                invdata = load(stream)
            # the inventory file sits directly below the base URL
            base_url = args.uri.rsplit("/", 1)[0]
        except Exception:
            # deliberate best effort: whatever went wrong (e.g. the URL is the
            # root of a documentation site, not the inventory file itself),
            # retry with the conventional inventory file name.
            # Trailing slashes are stripped first, so that the fallback URL
            # does not contain `//`, and the base URL has no trailing slash,
            # as in the branch above.
            base_url = args.uri.rstrip("/")
            with urlopen(f"{base_url}/objects.inv", timeout=args.timeout) as stream:
                invdata = load(stream)
    else:
        with open(args.uri, "rb") as stream:
            invdata = load(stream)

    filtered: InventoryType = {
        "name": invdata["name"],
        "version": invdata["version"],
        "base_url": base_url,
        "objects": {},
    }
    for match in filter_inventories(
        {"": invdata},
        domains=args.domain,
        otypes=args.object_type,
        targets=args.name,
    ):
        # the location filter is optional, and applied on top of the others
        if args.loc and not match_with_wildcard(match.loc, args.loc):
            continue
        filtered["objects"].setdefault(match.domain, {}).setdefault(match.otype, {})[
            match.name
        ] = {
            "loc": match.loc,
            "text": match.text,
        }

    if args.format == "json":
        print(json.dumps(filtered, indent=2, sort_keys=False))
    else:
        print(yaml.dump(filtered, sort_keys=False))



# Run the inventory command line interface, when this module is executed as a script.
if __name__ == "__main__":
    inventory_cli()