From 293f5887b7baa1a2d1f48e9e102eebbf893e6709 Mon Sep 17 00:00:00 2001 From: Rohan McGovern Date: Tue, 9 Aug 2022 08:49:14 +1000 Subject: [PATCH] Implement error handling Ultimately, all errors are propagated in some way, but it's important to differentiate between "the content was invalid" vs "failed to fetch the content". --- repo_autoindex/__init__.py | 7 +- repo_autoindex/_impl/api.py | 47 +++++++++-- repo_autoindex/_impl/base.py | 16 ++++ repo_autoindex/_impl/yum.py | 28 ++++--- tests/test_yum_render_corrupt.py | 132 +++++++++++++++++++++++++++++++ 5 files changed, 209 insertions(+), 21 deletions(-) create mode 100644 tests/test_yum_render_corrupt.py diff --git a/repo_autoindex/__init__.py b/repo_autoindex/__init__.py index b472b0b..77fcc0c 100644 --- a/repo_autoindex/__init__.py +++ b/repo_autoindex/__init__.py @@ -1,4 +1,7 @@ from ._impl.api import autoindex -from ._impl.base import Fetcher, GeneratedIndex +from ._impl.base import Fetcher, GeneratedIndex, ContentError -__all__ = ["autoindex", "Fetcher", "GeneratedIndex"] +ContentError.__module__ = "repo_autoindex" + + +__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"] diff --git a/repo_autoindex/_impl/api.py b/repo_autoindex/_impl/api.py index 12d8df2..b094190 100644 --- a/repo_autoindex/_impl/api.py +++ b/repo_autoindex/_impl/api.py @@ -5,7 +5,7 @@ from typing import Optional, Type import aiohttp -from .base import Fetcher, GeneratedIndex, Repo +from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError from .yum import YumRepo from .pulp import PulpFileRepo @@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher: return get_content_with_session +def with_error_handling(fetcher: Fetcher) -> Fetcher: + # wraps a fetcher such that any raised exceptions are wrapped into FetcherError + async def new_fetcher(url: str) -> Optional[str]: + try: + return await fetcher(url) + except Exception as exc: + raise FetcherError from exc + + return new_fetcher + + async def autoindex( url: str, *, @@ -91,6 +102,15 @@ async def autoindex( Zero indexes may be produced if the given URL doesn't represent a repository of any supported type. + + Raises: + :class:`ContentError` + Raised if indexed content appears to be invalid (for example, a yum repository + has invalid repodata). + + :class:`Exception` + Any exception raised by ``fetcher`` will propagate (for example, I/O errors or + HTTP request failures). """ if fetcher is None: async with aiohttp.ClientSession() as session: @@ -103,8 +123,23 @@ async def autoindex( while url.endswith("/"): url = url[:-1] - for repo_type in REPO_TYPES: - repo = await repo_type.probe(fetcher, url) - if repo: - async for page in repo.render_index(index_href_suffix=index_href_suffix): - yield page + fetcher = with_error_handling(fetcher) + + try: + for repo_type in REPO_TYPES: + repo = await repo_type.probe(fetcher, url) + if repo: + async for page in repo.render_index( + index_href_suffix=index_href_suffix + ): + yield page + except FetcherError as exc: + # FetcherErrors are unwrapped to propagate whatever was the original error + assert exc.__cause__ + raise exc.__cause__ from None + except ContentError: + # explicitly raised ContentErrors are allowed to propagate + raise + except Exception as exc: + # Any other errors are treated as a ContentError + raise ContentError(f"Invalid content found at {url}") from exc diff --git a/repo_autoindex/_impl/base.py b/repo_autoindex/_impl/base.py index a7af5e9..e9954a7 100644 --- a/repo_autoindex/_impl/base.py +++ b/repo_autoindex/_impl/base.py @@ -15,6 +15,22 @@ ICON_QCOW = "🐮" ICON_OTHER = " " +class ContentError(Exception): + """An error raised when indexed content appears to be invalid. + + Errors of this type are raised when repo-autoindex is able to successfully + retrieve content and determine a repository type but fails to parse + repository metadata. For example, a corrupt yum repository may cause this + error to be raised. + """ + + +class FetcherError(Exception): + # Internal-only error used to separate exceptions raised by fetchers from + # exceptions raised by anything else. + pass + + @dataclass class GeneratedIndex: """A single HTML index page generated by repo-autoindex.""" diff --git a/repo_autoindex/_impl/yum.py b/repo_autoindex/_impl/yum.py index 3d3366f..c46ccfa 100644 --- a/repo_autoindex/_impl/yum.py +++ b/repo_autoindex/_impl/yum.py @@ -3,30 +3,35 @@ import logging import os from collections.abc import AsyncGenerator, Generator, Iterable from dataclasses import dataclass -from typing import Optional, Type +from typing import Optional, Type, Any, TypeVar, NoReturn, overload from xml.dom.minidom import Element -from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream +from xml.dom.pulldom import END_ELEMENT, START_ELEMENT from defusedxml import pulldom # type: ignore -from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo +from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError from .template import TemplateContext from .tree import treeify LOG = logging.getLogger("autoindex") +def assert_repodata_ok(condition: Any, msg: str): + if not condition: + raise ContentError(msg) + + def get_tag(elem: Element, name: str) -> Element: elems: list[Element] = elem.getElementsByTagName(name) # type: ignore + assert_repodata_ok(len(elems) == 1, f"expected exactly one {name} tag") return elems[0] def get_text_tag(elem: Element, name: str) -> str: tagnode = get_tag(elem, name) child = tagnode.firstChild - # TODO: raise proper error if missing - assert child - return str(child.toxml()) + assert_repodata_ok(child, f"missing text {name} tag") + return str(child.toxml()) # type: ignore @dataclass @@ -39,7 +44,6 @@ class Package: def from_element(cls, elem: Element) -> "Package": return cls( href=get_tag(elem, "location").attributes["href"].value, - # TODO: tolerate some of these being absent or wrong. time=get_tag(elem, "time").attributes["file"].value, size=get_tag(elem, "size").attributes["package"].value, ) @@ -104,10 +108,9 @@ class YumRepo(Repo): ) if len(revision_nodes) == 1: timestamp_node = revision_nodes[0].firstChild - # TODO: raise proper error - assert timestamp_node + assert_repodata_ok(timestamp_node, "missing timestamp node") time = datetime.datetime.utcfromtimestamp( - int(timestamp_node.toxml()) + int(timestamp_node.toxml()) # type: ignore ).isoformat() out.append( @@ -165,11 +168,10 @@ class YumRepo(Repo): primary_url = "/".join([self.base_url, href]) primary_xml = await self.fetcher(primary_url) - # TODO: raise proper error if missing - assert primary_xml + assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}") return sorted( - [p.index_entry for p in self.__packages_from_primary(primary_xml)], + [p.index_entry for p in self.__packages_from_primary(primary_xml)], # type: ignore key=lambda e: e.text, ) diff --git a/tests/test_yum_render_corrupt.py b/tests/test_yum_render_corrupt.py new file mode 100644 index 0000000..7a7c567 --- /dev/null +++ b/tests/test_yum_render_corrupt.py @@ -0,0 +1,132 @@ +from typing import Optional +import textwrap + +from repo_autoindex import autoindex, ContentError + +REPOMD_XML = textwrap.dedent( + """ + + + 1657165688 + + d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf + 6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62 + + 1657165688 + 2932 + 16585 + + + 284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f + 72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f + + 1657165688 + 4621 + 36911 + + + 36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21 + 39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f + + 1657165688 + 1408 + 8432 + + + 55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21 + b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d + + 1657165688 + 7609 + 114688 + 10 + + + de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6 + 40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3 + + 1657165688 + 10323 + 65536 + 10 + + + 9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c + 3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972 + + 1657165688 + 2758 + 32768 + 10 + + +""" +).strip() + +PRIMARY_XML = textwrap.dedent( + """ + + + + +""" +).strip() + + +class StaticFetcher: + def __init__(self): + self.content: dict[str, str] = {} + + async def __call__(self, url: str) -> Optional[str]: + return self.content.get(url) + + +async def test_corrupt_repodata(): + fetcher = StaticFetcher() + + fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML + fetcher.content[ + "https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz" + ] = PRIMARY_XML + + error = None + try: + async for _ in autoindex("https://example.com", fetcher=fetcher): + pass + except ContentError as exc: + error = exc + + # It should have raised a ContentError + assert error + + # It should summarize + assert "Invalid content found at https://example.com" in str(error) + + # We don't want the test to depend on precise details, but it should have + # some cause coming from the XML parser + assert "xml" in error.__cause__.__module__ + + +async def test_missing_primary(): + fetcher = StaticFetcher() + + fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML + + error = None + try: + async for _ in autoindex("https://example.com", fetcher=fetcher): + pass + except ContentError as exc: + error = exc + + # It should have raised a ContentError + assert error + + # It should state the reason + assert ( + "missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz" + in str(error) + ) + + # This one doesn't have a separate cause as it was raised explicitly by our code + assert not error.__cause__