Implement error handling

Ultimately, all errors are propagated in some way, but it's important to
differentiate between "the content was invalid" vs "failed to fetch the
content".
This commit is contained in:
Rohan McGovern 2022-08-09 08:49:14 +10:00
parent 74f657c79e
commit 293f5887b7
5 changed files with 209 additions and 21 deletions

View file

@@ -1,4 +1,7 @@
from ._impl.api import autoindex
from ._impl.base import Fetcher, GeneratedIndex
from ._impl.base import Fetcher, GeneratedIndex, ContentError
__all__ = ["autoindex", "Fetcher", "GeneratedIndex"]
ContentError.__module__ = "repo_autoindex"
__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"]

View file

@@ -5,7 +5,7 @@ from typing import Optional, Type
import aiohttp
from .base import Fetcher, GeneratedIndex, Repo
from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError
from .yum import YumRepo
from .pulp import PulpFileRepo
@@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
return get_content_with_session
def with_error_handling(fetcher: Fetcher) -> Fetcher:
    """Wrap ``fetcher`` so that every exception it raises surfaces as a
    :class:`FetcherError`.

    The original exception is kept as ``__cause__`` so that callers can
    later unwrap and re-raise it.
    """

    async def wrapped(url: str) -> Optional[str]:
        try:
            result = await fetcher(url)
        except Exception as exc:
            raise FetcherError from exc
        return result

    return wrapped
async def autoindex(
url: str,
*,
@@ -91,6 +102,15 @@ async def autoindex(
Zero indexes may be produced if the given URL doesn't represent a repository
of any supported type.
Raises:
:class:`ContentError`
Raised if indexed content appears to be invalid (for example, a yum repository
has invalid repodata).
:class:`Exception`
Any exception raised by ``fetcher`` will propagate (for example, I/O errors or
HTTP request failures).
"""
if fetcher is None:
async with aiohttp.ClientSession() as session:
@@ -103,8 +123,23 @@
while url.endswith("/"):
url = url[:-1]
for repo_type in REPO_TYPES:
repo = await repo_type.probe(fetcher, url)
if repo:
async for page in repo.render_index(index_href_suffix=index_href_suffix):
yield page
fetcher = with_error_handling(fetcher)
try:
for repo_type in REPO_TYPES:
repo = await repo_type.probe(fetcher, url)
if repo:
async for page in repo.render_index(
index_href_suffix=index_href_suffix
):
yield page
except FetcherError as exc:
# FetcherErrors are unwrapped to propagate whatever was the original error
assert exc.__cause__
raise exc.__cause__ from None
except ContentError:
# explicitly raised ContentErrors are allowed to propagate
raise
except Exception as exc:
# Any other errors are treated as a ContentError
raise ContentError(f"Invalid content found at {url}") from exc

View file

@@ -15,6 +15,22 @@ ICON_QCOW = "🐮"
ICON_OTHER = " "
class ContentError(Exception):
    """An error raised when indexed content appears to be invalid.

    Errors of this type are raised when repo-autoindex is able to successfully
    retrieve content and determine a repository type but fails to parse
    repository metadata. For example, a corrupt yum repository may cause this
    error to be raised.
    """
class FetcherError(Exception):
    # Internal-only error used to separate exceptions raised by fetchers from
    # exceptions raised by anything else. Never propagated to callers: it is
    # unwrapped (via __cause__) before leaving the library.
    pass
@dataclass
class GeneratedIndex:
"""A single HTML index page generated by repo-autoindex."""

View file

@@ -3,30 +3,35 @@ import logging
import os
from collections.abc import AsyncGenerator, Generator, Iterable
from dataclasses import dataclass
from typing import Optional, Type
from typing import Optional, Type, Any, TypeVar, NoReturn, overload
from xml.dom.minidom import Element
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
from defusedxml import pulldom # type: ignore
from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo
from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
from .template import TemplateContext
from .tree import treeify
LOG = logging.getLogger("autoindex")
def assert_repodata_ok(condition: Any, msg: str) -> None:
    """Raise a :class:`ContentError` with ``msg`` if ``condition`` is falsy.

    Used while parsing yum repodata so that malformed metadata surfaces as a
    ContentError rather than an AssertionError (which would be stripped under
    ``python -O``) or an unrelated TypeError/AttributeError.
    """
    if not condition:
        raise ContentError(msg)
def get_tag(elem: Element, name: str) -> Element:
    """Return the single child element of ``elem`` with tag ``name``.

    Raises ContentError (via assert_repodata_ok) when the tag is absent or
    appears more than once.
    """
    matches: list[Element] = elem.getElementsByTagName(name)  # type: ignore
    assert_repodata_ok(len(matches) == 1, f"expected exactly one {name} tag")
    return matches[0]
def get_text_tag(elem: Element, name: str) -> str:
tagnode = get_tag(elem, name)
child = tagnode.firstChild
# TODO: raise proper error if missing
assert child
return str(child.toxml())
assert_repodata_ok(child, f"missing text {name} tag")
return str(child.toxml()) # type: ignore
@dataclass
@@ -39,7 +44,6 @@ class Package:
def from_element(cls, elem: Element) -> "Package":
return cls(
href=get_tag(elem, "location").attributes["href"].value,
# TODO: tolerate some of these being absent or wrong.
time=get_tag(elem, "time").attributes["file"].value,
size=get_tag(elem, "size").attributes["package"].value,
)
@@ -104,10 +108,9 @@ class YumRepo(Repo):
)
if len(revision_nodes) == 1:
timestamp_node = revision_nodes[0].firstChild
# TODO: raise proper error
assert timestamp_node
assert_repodata_ok(timestamp_node, "missing timestamp node")
time = datetime.datetime.utcfromtimestamp(
int(timestamp_node.toxml())
int(timestamp_node.toxml()) # type: ignore
).isoformat()
out.append(
@@ -165,11 +168,10 @@ class YumRepo(Repo):
primary_url = "/".join([self.base_url, href])
primary_xml = await self.fetcher(primary_url)
# TODO: raise proper error if missing
assert primary_xml
assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}")
return sorted(
[p.index_entry for p in self.__packages_from_primary(primary_xml)],
[p.index_entry for p in self.__packages_from_primary(primary_xml)], # type: ignore
key=lambda e: e.text,
)

View file

@@ -0,0 +1,132 @@
from typing import Optional
import textwrap
from repo_autoindex import autoindex, ContentError
# Fixture: a realistic repomd.xml listing primary/filelists/other metadata
# (both XML and sqlite variants) for a yum repository.
REPOMD_XML = textwrap.dedent(
    """
<?xml version="1.0" encoding="UTF-8"?>
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
<revision>1657165688</revision>
<data type="primary">
<checksum type="sha256">d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf</checksum>
<open-checksum type="sha256">6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62</open-checksum>
<location href="repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>2932</size>
<open-size>16585</open-size>
</data>
<data type="filelists">
<checksum type="sha256">284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f</checksum>
<open-checksum type="sha256">72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f</open-checksum>
<location href="repodata/284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f-filelists.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>4621</size>
<open-size>36911</open-size>
</data>
<data type="other">
<checksum type="sha256">36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21</checksum>
<open-checksum type="sha256">39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f</open-checksum>
<location href="repodata/36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21-other.xml.gz"/>
<timestamp>1657165688</timestamp>
<size>1408</size>
<open-size>8432</open-size>
</data>
<data type="primary_db">
<checksum type="sha256">55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21</checksum>
<open-checksum type="sha256">b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d</open-checksum>
<location href="repodata/55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21-primary.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>7609</size>
<open-size>114688</open-size>
<database_version>10</database_version>
</data>
<data type="filelists_db">
<checksum type="sha256">de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6</checksum>
<open-checksum type="sha256">40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3</open-checksum>
<location href="repodata/de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6-filelists.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>10323</size>
<open-size>65536</open-size>
<database_version>10</database_version>
</data>
<data type="other_db">
<checksum type="sha256">9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c</checksum>
<open-checksum type="sha256">3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972</open-checksum>
<location href="repodata/9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c-other.sqlite.bz2"/>
<timestamp>1657165688</timestamp>
<size>2758</size>
<open-size>32768</open-size>
<database_version>10</database_version>
</data>
</repomd>
"""
).strip()
# Fixture: deliberately truncated primary XML (unclosed <package>/<name> tags)
# used to provoke an XML parse failure in test_corrupt_repodata.
PRIMARY_XML = textwrap.dedent(
    """
<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns="http://linux.duke.edu/metadata/common" xmlns:rpm="http://linux.duke.edu/metadata/rpm" packages="5">
<package type="rpm">
<name>
"""
).strip()
class StaticFetcher:
    """A fetcher backed by a static in-memory mapping.

    Tests populate ``content`` with url => body entries before calling
    autoindex; any URL not present in the mapping yields ``None``.
    """

    def __init__(self):
        # Maps URL to the body that should be "fetched" for it.
        self.content: dict[str, str] = {}

    async def __call__(self, url: str) -> Optional[str]:
        if url in self.content:
            return self.content[url]
        return None
async def test_corrupt_repodata():
    """autoindex wraps XML parse failures in a ContentError."""
    fetcher = StaticFetcher()
    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
    fetcher.content[
        "https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
    ] = PRIMARY_XML

    caught = None
    try:
        async for _ in autoindex("https://example.com", fetcher=fetcher):
            pass
    except ContentError as exc:
        caught = exc

    # A ContentError summarizing the failure must have been raised...
    assert caught
    assert "Invalid content found at https://example.com" in str(caught)
    # ...chained to some underlying error from the XML parser. We avoid
    # depending on the parser's precise exception type or message.
    assert "xml" in caught.__cause__.__module__
async def test_missing_primary():
    """A repomd.xml referencing absent primary XML yields a ContentError."""
    fetcher = StaticFetcher()
    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML

    caught = None
    try:
        async for _ in autoindex("https://example.com", fetcher=fetcher):
            pass
    except ContentError as exc:
        caught = exc

    # A ContentError naming the missing file must have been raised...
    assert caught is not None
    assert (
        "missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
        in str(caught)
    )
    # ...with no __cause__, since our own code raised it explicitly.
    assert not caught.__cause__