"""Logic for dealing with sphinx style inventories (e.g. `objects.inv`).
These contain mappings of reference names to ids, scoped by domain and object type.
This is adapted from the Sphinx inventory.py module.
We replicate it here, so that it can be used without Sphinx.
"""
from __future__ import annotations
import argparse
import functools
import json
import re
import zlib
from collections.abc import Iterator
from dataclasses import asdict, dataclass
from typing import IO, TYPE_CHECKING, TypedDict
from urllib.request import urlopen
import yaml
if TYPE_CHECKING:
# domain_type:object_type -> name -> (project, version, loc, text)
# the `loc` includes the base url, also null `text` is denoted by "-"
from sphinx.util.typing import Inventory as SphinxInventoryType
[docs]
class InventoryItemType(TypedDict):
"""A single inventory item."""
loc: str
"""The location of the item (relative if base_url not None)."""
text: str | None
"""Implicit text to show for the item."""
[docs]
class InventoryType(TypedDict):
"""Inventory data."""
name: str
"""The name of the project."""
version: str
"""The version of the project."""
base_url: str | None
"""The base URL of the `loc`."""
objects: dict[str, dict[str, dict[str, InventoryItemType]]]
"""Mapping of domain -> object type -> name -> item."""
[docs]
def from_sphinx(inv: SphinxInventoryType) -> InventoryType:
"""Convert from a Sphinx compliant format."""
project = ""
version = ""
objs: dict[str, dict[str, dict[str, InventoryItemType]]] = {}
for domain_obj_name, data in inv.items():
if ":" not in domain_obj_name:
continue
domain_name, obj_type = domain_obj_name.split(":", 1)
objs.setdefault(domain_name, {}).setdefault(obj_type, {})
for refname, refdata in data.items():
project, version, uri, text = refdata
objs[domain_name][obj_type][refname] = {
"loc": uri,
"text": None if (not text or text == "-") else text,
}
return {
"name": project,
"version": version,
"base_url": None,
"objects": objs,
}
[docs]
def to_sphinx(inv: InventoryType) -> SphinxInventoryType:
"""Convert to a Sphinx compliant format."""
objs: SphinxInventoryType = {}
for domain_name, obj_types in inv["objects"].items():
for obj_type, refs in obj_types.items():
for refname, refdata in refs.items():
objs.setdefault(f"{domain_name}:{obj_type}", {})[refname] = (
inv["name"],
inv["version"],
refdata["loc"],
refdata["text"] or "-",
)
return objs
[docs]
def load(stream: IO, base_url: str | None = None) -> InventoryType:
"""Load inventory data from a stream."""
reader = InventoryFileReader(stream)
line = reader.readline().rstrip()
if line == "# Sphinx inventory version 1":
return _load_v1(reader, base_url)
elif line == "# Sphinx inventory version 2":
return _load_v2(reader, base_url)
else:
raise ValueError(f"invalid inventory header: {line}")
def _load_v1(stream: InventoryFileReader, base_url: str | None) -> InventoryType:
"""Load inventory data (format v1) from a stream."""
projname = stream.readline().rstrip()[11:]
version = stream.readline().rstrip()[11:]
invdata: InventoryType = {
"name": projname,
"version": version,
"base_url": base_url,
"objects": {},
}
for line in stream.readlines():
name, objtype, location = line.rstrip().split(None, 2)
# version 1 did not add anchors to the location
domain = "py"
if objtype == "mod":
objtype = "module"
location += "#module-" + name
else:
location += "#" + name
invdata["objects"].setdefault(domain, {}).setdefault(objtype, {})
invdata["objects"][domain][objtype][name] = {"loc": location, "text": None}
return invdata
def _load_v2(stream: InventoryFileReader, base_url: str | None) -> InventoryType:
"""Load inventory data (format v2) from a stream."""
projname = stream.readline().rstrip()[11:]
version = stream.readline().rstrip()[11:]
invdata: InventoryType = {
"name": projname,
"version": version,
"base_url": base_url,
"objects": {},
}
line = stream.readline()
if "zlib" not in line:
raise ValueError(f"invalid inventory header (not compressed): {line}")
for line in stream.read_compressed_lines():
# be careful to handle names with embedded spaces correctly
m = re.match(r"(?x)(.+?)\s+(\S+)\s+(-?\d+)\s+?(\S*)\s+(.*)", line.rstrip())
if not m:
continue
name: str
type: str
name, type, _, location, text = m.groups()
if ":" not in type:
# wrong type value. type should be in the form of "{domain}:{objtype}"
#
# Note: To avoid the regex DoS, this is implemented in python (refs: #8175)
continue
if (
type == "py:module"
and type in invdata["objects"]
and name in invdata["objects"][type]
):
# due to a bug in 1.1 and below,
# two inventory entries are created
# for Python modules, and the first
# one is correct
continue
if location.endswith("$"):
location = location[:-1] + name
domain, objtype = type.split(":", 1)
invdata["objects"].setdefault(domain, {}).setdefault(objtype, {})
if not text or text == "-":
text = None
invdata["objects"][domain][objtype][name] = {"loc": location, "text": text}
return invdata
_BUFSIZE = 16 * 1024
[docs]
class InventoryFileReader:
"""A file reader for an inventory file.
This reader supports mixture of texts and compressed texts.
"""
def __init__(self, stream: IO) -> None:
self.stream = stream
self.buffer = b""
self.eof = False
[docs]
def read_buffer(self) -> None:
chunk = self.stream.read(_BUFSIZE)
if chunk == b"":
self.eof = True
self.buffer += chunk
[docs]
def readline(self) -> str:
pos = self.buffer.find(b"\n")
if pos != -1:
line = self.buffer[:pos].decode()
self.buffer = self.buffer[pos + 1 :]
elif self.eof:
line = self.buffer.decode()
self.buffer = b""
else:
self.read_buffer()
line = self.readline()
return line
[docs]
def readlines(self) -> Iterator[str]:
while not self.eof:
line = self.readline()
if line:
yield line
[docs]
def read_compressed_chunks(self) -> Iterator[bytes]:
decompressor = zlib.decompressobj()
while not self.eof:
self.read_buffer()
yield decompressor.decompress(self.buffer)
self.buffer = b""
yield decompressor.flush()
[docs]
def read_compressed_lines(self) -> Iterator[str]:
buf = b""
for chunk in self.read_compressed_chunks():
buf += chunk
pos = buf.find(b"\n")
while pos != -1:
yield buf[:pos].decode()
buf = buf[pos + 1 :]
pos = buf.find(b"\n")
@functools.lru_cache(maxsize=256)
def _create_regex(pat: str) -> re.Pattern[str]:
r"""Create a regex from a pattern, that can include `*` wildcards,
to match 0 or more characters.
`\*` is translated as a literal `*`.
"""
regex = ""
backslash_last = False
for char in pat:
if backslash_last and char == "*":
regex += re.escape(char)
backslash_last = False
continue
if backslash_last:
regex += re.escape("\\")
backslash_last = False
if char == "\\":
backslash_last = True
continue
if char == "*":
regex += ".*"
continue
regex += re.escape(char)
return re.compile(regex)
[docs]
def match_with_wildcard(name: str, pattern: str | None) -> bool:
r"""Match a whole name with a pattern, that can include `*` wildcards,
to match 0 or more characters.
To include a literal `*` in the pattern, use `\*`.
"""
if pattern is None:
return True
regex = _create_regex(pattern)
return regex.fullmatch(name) is not None
[docs]
@dataclass
class InvMatch:
"""A match from an inventory."""
inv: str
domain: str
otype: str
name: str
project: str
version: str
base_url: str | None
loc: str
text: str | None
[docs]
def asdict(self) -> dict[str, str]:
return asdict(self)
[docs]
def filter_inventories(
inventories: dict[str, InventoryType],
*,
invs: str | None = None,
domains: str | None = None,
otypes: str | None = None,
targets: str | None = None,
) -> Iterator[InvMatch]:
r"""Filter a set of inventories.
Filters are strings that can include `*` wildcards, to match 0 or more characters.
To include a literal `*` in the pattern, use `\*`.
:param inventories: Mapping of inventory name to inventory data
:param invs: the inventory key filter
:param domains: the domain name filter
:param otypes: the object type filter
:param targets: the target name filter
"""
for inv_name, inv_data in inventories.items():
if not match_with_wildcard(inv_name, invs):
continue
for domain_name, dom_data in inv_data["objects"].items():
if not match_with_wildcard(domain_name, domains):
continue
for obj_type, obj_data in dom_data.items():
if not match_with_wildcard(obj_type, otypes):
continue
for target, item_data in obj_data.items():
if match_with_wildcard(target, targets):
yield InvMatch(
inv=inv_name,
domain=domain_name,
otype=obj_type,
name=target,
project=inv_data["name"],
version=inv_data["version"],
base_url=inv_data["base_url"],
loc=item_data["loc"],
text=item_data["text"],
)
[docs]
def filter_sphinx_inventories(
inventories: dict[str, SphinxInventoryType],
*,
invs: str | None = None,
domains: str | None = None,
otypes: str | None = None,
targets: str | None = None,
) -> Iterator[InvMatch]:
r"""Filter a set of sphinx style inventories.
Filters are strings that can include `*` wildcards, to match 0 or more characters.
To include a literal `*` in the pattern, use `\*`.
:param inventories: Mapping of inventory name to inventory data
:param invs: the inventory key filter
:param domains: the domain name filter
:param otypes: the object type filter
:param targets: the target name filter
"""
for inv_name, inv_data in inventories.items():
if not match_with_wildcard(inv_name, invs):
continue
for domain_obj_name, data in inv_data.items():
if ":" not in domain_obj_name:
continue
domain_name, obj_type = domain_obj_name.split(":", 1)
if not (
match_with_wildcard(domain_name, domains)
and match_with_wildcard(obj_type, otypes)
):
continue
for target in data:
if match_with_wildcard(target, targets):
project, version, loc, text = data[target]
yield (
InvMatch(
inv=inv_name,
domain=domain_name,
otype=obj_type,
name=target,
project=project,
version=version,
base_url=None,
loc=loc,
text=None if (not text or text == "-") else text,
)
)
[docs]
def filter_string(
invs: str | None,
domains: str | None,
otype: str | None,
target: str | None,
*,
delimiter: str = ":",
) -> str:
"""Create a string representation of the filter, from the given arguments."""
str_items = []
for item in (invs, domains, otype, target):
if item is None:
str_items.append("*")
elif delimiter in item:
str_items.append(f'"{item}"')
else:
str_items.append(f"{item}")
return delimiter.join(str_items)
[docs]
def fetch_inventory(
uri: str, *, timeout: None | float = None, base_url: None | str = None
) -> InventoryType:
"""Fetch an inventory from a URL or local path."""
if uri.startswith(("http://", "https://")):
with urlopen(uri, timeout=timeout) as stream:
return load(stream, base_url=base_url)
with open(uri, "rb") as stream:
return load(stream, base_url=base_url)
[docs]
def inventory_cli(inputs: None | list[str] = None):
"""Command line interface for fetching and parsing an inventory."""
parser = argparse.ArgumentParser(description="Parse an inventory file.")
parser.add_argument("uri", metavar="[URL|PATH]", help="URI of the inventory file")
parser.add_argument(
"-d",
"--domain",
metavar="DOMAIN",
default="*",
help="Filter the inventory by domain (`*` = wildcard)",
)
parser.add_argument(
"-o",
"--object-type",
metavar="TYPE",
default="*",
help="Filter the inventory by object type (`*` = wildcard)",
)
parser.add_argument(
"-n",
"--name",
metavar="NAME",
default="*",
help="Filter the inventory by reference name (`*` = wildcard)",
)
parser.add_argument(
"-l",
"--loc",
metavar="LOC",
help="Filter the inventory by reference location (`*` = wildcard)",
)
parser.add_argument(
"-f",
"--format",
choices=["yaml", "json"],
default="yaml",
help="Output format",
)
parser.add_argument(
"--timeout",
type=float,
metavar="SECONDS",
help="Timeout for fetching the inventory",
)
args = parser.parse_args(inputs)
base_url = None
if args.uri.startswith("http://") or args.uri.startswith("https://"):
try:
with urlopen(args.uri, timeout=args.timeout) as stream:
invdata = load(stream)
base_url = args.uri.rsplit("/", 1)[0]
except Exception:
with urlopen(args.uri + "/objects.inv", timeout=args.timeout) as stream:
invdata = load(stream)
base_url = args.uri
else:
with open(args.uri, "rb") as stream:
invdata = load(stream)
filtered: InventoryType = {
"name": invdata["name"],
"version": invdata["version"],
"base_url": base_url,
"objects": {},
}
for match in filter_inventories(
{"": invdata},
domains=args.domain,
otypes=args.object_type,
targets=args.name,
):
if args.loc and not match_with_wildcard(match.loc, args.loc):
continue
filtered["objects"].setdefault(match.domain, {}).setdefault(match.otype, {})[
match.name
] = {
"loc": match.loc,
"text": match.text,
}
if args.format == "json":
print(json.dumps(filtered, indent=2, sort_keys=False))
else:
print(yaml.dump(filtered, sort_keys=False))
if __name__ == "__main__":
inventory_cli()