Implement error handling

Ultimately, all errors are propagated in some way, but it's important to
differentiate between "the content was invalid" vs "failed to fetch the
content".
This commit is contained in:
Rohan McGovern 2022-08-09 08:49:14 +10:00
parent 74f657c79e
commit 293f5887b7
5 changed files with 209 additions and 21 deletions

View file

@@ -1,4 +1,7 @@
from ._impl.api import autoindex
from ._impl.base import Fetcher, GeneratedIndex
from ._impl.base import Fetcher, GeneratedIndex, ContentError
__all__ = ["autoindex", "Fetcher", "GeneratedIndex"]
ContentError.__module__ = "repo_autoindex"
__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"]

View file

@@ -5,7 +5,7 @@ from typing import Optional, Type
import aiohttp
from .base import Fetcher, GeneratedIndex, Repo
from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError
from .yum import YumRepo
from .pulp import PulpFileRepo
@@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
return get_content_with_session
def with_error_handling(fetcher: Fetcher) -> Fetcher:
    """Wrap ``fetcher`` so that every exception it raises surfaces as a
    :class:`FetcherError`.

    The original exception is kept as ``__cause__`` so that callers can
    later unwrap and re-raise it.
    """

    async def wrapped(url: str) -> Optional[str]:
        try:
            result = await fetcher(url)
        except Exception as exc:
            raise FetcherError from exc
        return result

    return wrapped
async def autoindex(
url: str,
*,
@@ -91,6 +102,15 @@ async def autoindex(
Zero indexes may be produced if the given URL doesn't represent a repository
of any supported type.
Raises:
:class:`ContentError`
Raised if indexed content appears to be invalid (for example, a yum repository
has invalid repodata).
:class:`Exception`
Any exception raised by ``fetcher`` will propagate (for example, I/O errors or
HTTP request failures).
"""
if fetcher is None:
async with aiohttp.ClientSession() as session:
@@ -103,8 +123,23 @@
while url.endswith("/"):
url = url[:-1]
for repo_type in REPO_TYPES:
repo = await repo_type.probe(fetcher, url)
if repo:
async for page in repo.render_index(index_href_suffix=index_href_suffix):
yield page
fetcher = with_error_handling(fetcher)
try:
for repo_type in REPO_TYPES:
repo = await repo_type.probe(fetcher, url)
if repo:
async for page in repo.render_index(
index_href_suffix=index_href_suffix
):
yield page
except FetcherError as exc:
# FetcherErrors are unwrapped to propagate whatever was the original error
assert exc.__cause__
raise exc.__cause__ from None
except ContentError:
# explicitly raised ContentErrors are allowed to propagate
raise
except Exception as exc:
# Any other errors are treated as a ContentError
raise ContentError(f"Invalid content found at {url}") from exc

View file

@@ -15,6 +15,22 @@ ICON_QCOW = "🐮"
ICON_OTHER = " "
class ContentError(Exception):
    """An error raised when indexed content appears to be invalid.

    Errors of this type are raised when repo-autoindex is able to successfully
    retrieve content and determine a repository type but fails to parse
    repository metadata. For example, a corrupt yum repository may cause this
    error to be raised.
    """
class FetcherError(Exception):
    # Internal-only error used to separate exceptions raised by fetchers from
    # exceptions raised by anything else. Never propagated to callers: it is
    # unwrapped (via __cause__) before leaving the library.
    pass
@dataclass
class GeneratedIndex:
"""A single HTML index page generated by repo-autoindex."""

View file

@@ -3,30 +3,35 @@ import logging
import os
from collections.abc import AsyncGenerator, Generator, Iterable
from dataclasses import dataclass
from typing import Optional, Type
from typing import Optional, Type, Any, TypeVar, NoReturn, overload
from xml.dom.minidom import Element
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
from defusedxml import pulldom # type: ignore
from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo
from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
from .template import TemplateContext
from .tree import treeify
LOG = logging.getLogger("autoindex")
def assert_repodata_ok(condition: Any, msg: str) -> None:
    """Raise a :class:`ContentError` with ``msg`` if ``condition`` is falsy.

    Used while parsing yum repodata so that malformed metadata surfaces as a
    ContentError rather than an AssertionError (which would be stripped under
    ``python -O``) or an unrelated TypeError/AttributeError.
    """
    if not condition:
        raise ContentError(msg)
def get_tag(elem: Element, name: str) -> Element:
    """Return the single child element of ``elem`` with tag ``name``.

    Raises ContentError (via assert_repodata_ok) when the tag is absent or
    appears more than once.
    """
    matches: list[Element] = elem.getElementsByTagName(name)  # type: ignore
    assert_repodata_ok(len(matches) == 1, f"expected exactly one {name} tag")
    return matches[0]
def get_text_tag(elem: Element, name: str) -> str:
tagnode = get_tag(elem, name)
child = tagnode.firstChild
# TODO: raise proper error if missing
assert child
return str(child.toxml())
assert_repodata_ok(child, f"missing text {name} tag")
return str(child.toxml()) # type: ignore
@dataclass
@@ -39,7 +44,6 @@ class Package:
def from_element(cls, elem: Element) -> "Package":
return cls(
href=get_tag(elem, "location").attributes["href"].value,
# TODO: tolerate some of these being absent or wrong.
time=get_tag(elem, "time").attributes["file"].value,
size=get_tag(elem, "size").attributes["package"].value,
)
@@ -104,10 +108,9 @@ class YumRepo(Repo):
)
if len(revision_nodes) == 1:
timestamp_node = revision_nodes[0].firstChild
# TODO: raise proper error
assert timestamp_node
assert_repodata_ok(timestamp_node, "missing timestamp node")
time = datetime.datetime.utcfromtimestamp(
int(timestamp_node.toxml())
int(timestamp_node.toxml()) # type: ignore
).isoformat()
out.append(
@@ -165,11 +168,10 @@ class YumRepo(Repo):
primary_url = "/".join([self.base_url, href])
primary_xml = await self.fetcher(primary_url)
# TODO: raise proper error if missing
assert primary_xml
assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}")
return sorted(
[p.index_entry for p in self.__packages_from_primary(primary_xml)],
[p.index_entry for p in self.__packages_from_primary(primary_xml)], # type: ignore
key=lambda e: e.text,
)

View file

@@ -0,0 +1,132 @@
from typing import Optional
import textwrap
from repo_autoindex import autoindex, ContentError
# Fixture: a realistic repomd.xml listing primary/filelists/other metadata
# (both XML and sqlite variants) for a yum repository.
REPOMD_XML = textwrap.dedent(
    """
<?xml version="1.0" encoding="UTF-8"?>
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
<revision>1657165688</revision>
<data type="primary">
<checksum type="sha256">d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf</checksum>
<open-checksum type="sha256">6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62</open-checksum>
<location href="repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>2932</size>
<open-size>16585</open-size>
</data>
<data type="filelists">
<checksum type="sha256">284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f</checksum>
<open-checksum type="sha256">72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f</open-checksum>
<location href="repodata/284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f-filelists.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>4621</size>
<open-size>36911</open-size>
</data>
<data type="other">
<checksum type="sha256">36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21</checksum>
<open-checksum type="sha256">39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f</open-checksum>
<location href="repodata/36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21-other.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>1408</size>
<open-size>8432</open-size>
</data>
<data type="primary_db">
<checksum type="sha256">55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21</checksum>
<open-checksum type="sha256">b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d</open-checksum>
<location href="repodata/55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21-primary.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>7609</size>
<open-size>114688</open-size>
<database_version>10</database_version>
</data>
<data type="filelists_db">
<checksum type="sha256">de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6</checksum>
<open-checksum type="sha256">40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3</open-checksum>
<location href="repodata/de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6-filelists.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>10323</size>
<open-size>65536</open-size>
<database_version>10</database_version>
</data>
<data type="other_db">
<checksum type="sha256">9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c</checksum>
<open-checksum type="sha256">3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972</open-checksum>
<location href="repodata/9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c-other.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>2758</size>
<open-size>32768</open-size>
<database_version>10</database_version>
</data>
</repomd>
"""
).strip()
# Fixture: deliberately truncated primary XML (unclosed <package>/<name> tags)
# used to provoke an XML parse failure in test_corrupt_repodata.
PRIMARY_XML = textwrap.dedent(
    """
<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns="http://linux.duke.edu/metadata/common" xmlns:rpm="http://linux.duke.edu/metadata/rpm" packages="5">
<package type="rpm">
<name>
"""
).strip()
class StaticFetcher:
    """A fetcher backed by a static in-memory mapping.

    Tests populate ``content`` with url => body entries before calling
    autoindex; any URL not present in the mapping yields ``None``.
    """

    def __init__(self):
        # Maps URL to the body that should be "fetched" for it.
        self.content: dict[str, str] = {}

    async def __call__(self, url: str) -> Optional[str]:
        if url in self.content:
            return self.content[url]
        return None
async def test_corrupt_repodata():
    """autoindex wraps XML parse failures in a ContentError."""
    fetcher = StaticFetcher()
    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
    fetcher.content[
        "https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
    ] = PRIMARY_XML

    caught = None
    try:
        async for _ in autoindex("https://example.com", fetcher=fetcher):
            pass
    except ContentError as exc:
        caught = exc

    # A ContentError summarizing the failure must have been raised...
    assert caught
    assert "Invalid content found at https://example.com" in str(caught)
    # ...chained to some underlying error from the XML parser. We avoid
    # depending on the parser's precise exception type or message.
    assert "xml" in caught.__cause__.__module__
async def test_missing_primary():
    """A repomd.xml referencing absent primary XML yields a ContentError."""
    fetcher = StaticFetcher()
    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML

    caught = None
    try:
        async for _ in autoindex("https://example.com", fetcher=fetcher):
            pass
    except ContentError as exc:
        caught = exc

    # A ContentError naming the missing file must have been raised...
    assert caught is not None
    assert (
        "missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
        in str(caught)
    )
    # ...with no __cause__, since our own code raised it explicitly.
    assert not caught.__cause__