Mirror of https://github.com/release-engineering/repo-autoindex.git (synced 2025-02-23 13:42:52 +00:00)
Implement error handling
Ultimately, all errors are propagated in some way, but it's important to differentiate between "the content was invalid" and "failed to fetch the content". A caller-side sketch follows the changed-files summary below.
parent 74f657c79e
commit 293f5887b7
5 changed files with 209 additions and 21 deletions
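
The distinction matters most to callers. A minimal sketch of the consumer-side picture (the URL is a placeholder, and the aiohttp.ClientError clause assumes the default aiohttp-based fetcher is in use):

import asyncio

import aiohttp

from repo_autoindex import autoindex, ContentError


async def main() -> None:
    try:
        async for page in autoindex("https://example.com/some/repo"):
            ...  # each page is a GeneratedIndex; render or store it
    except ContentError:
        # A repository was found, but its metadata is invalid.
        raise
    except aiohttp.ClientError:
        # The fetcher could not retrieve content at all; such errors
        # propagate unchanged from the fetcher.
        raise


asyncio.run(main())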
repo_autoindex/__init__.py
@@ -1,4 +1,7 @@
 from ._impl.api import autoindex
-from ._impl.base import Fetcher, GeneratedIndex
+from ._impl.base import Fetcher, GeneratedIndex, ContentError
 
-__all__ = ["autoindex", "Fetcher", "GeneratedIndex"]
+ContentError.__module__ = "repo_autoindex"
+
+__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"]
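
Re-exporting ContentError and pointing its __module__ at the package root makes the class present itself under its public import path rather than the private one; a quick illustration:

from repo_autoindex import ContentError

# Without the __module__ assignment this would render as
# <class 'repo_autoindex._impl.base.ContentError'>.
print(ContentError)  # <class 'repo_autoindex.ContentError'>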
repo_autoindex/_impl/api.py
@@ -5,7 +5,7 @@ from typing import Optional, Type
 
 import aiohttp
 
-from .base import Fetcher, GeneratedIndex, Repo
+from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError
 from .yum import YumRepo
 from .pulp import PulpFileRepo
 
@@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
     return get_content_with_session
 
 
+def with_error_handling(fetcher: Fetcher) -> Fetcher:
+    # wraps a fetcher such that any raised exceptions are wrapped into FetcherError
+    async def new_fetcher(url: str) -> Optional[str]:
+        try:
+            return await fetcher(url)
+        except Exception as exc:
+            raise FetcherError from exc
+
+    return new_fetcher
+
+
 async def autoindex(
     url: str,
     *,
@@ -91,6 +102,15 @@ async def autoindex(
 
     Zero indexes may be produced if the given URL doesn't represent a repository
     of any supported type.
+
+    Raises:
+        :class:`ContentError`
+            Raised if indexed content appears to be invalid (for example, a yum repository
+            has invalid repodata).
+
+        :class:`Exception`
+            Any exception raised by ``fetcher`` will propagate (for example, I/O errors or
+            HTTP request failures).
     """
     if fetcher is None:
         async with aiohttp.ClientSession() as session:
@@ -103,8 +123,23 @@ async def autoindex(
     while url.endswith("/"):
         url = url[:-1]
 
-    for repo_type in REPO_TYPES:
-        repo = await repo_type.probe(fetcher, url)
-        if repo:
-            async for page in repo.render_index(index_href_suffix=index_href_suffix):
-                yield page
+    fetcher = with_error_handling(fetcher)
+
+    try:
+        for repo_type in REPO_TYPES:
+            repo = await repo_type.probe(fetcher, url)
+            if repo:
+                async for page in repo.render_index(
+                    index_href_suffix=index_href_suffix
+                ):
+                    yield page
+    except FetcherError as exc:
+        # FetcherErrors are unwrapped to propagate whatever was the original error
+        assert exc.__cause__
+        raise exc.__cause__ from None
+    except ContentError:
+        # explicitly raised ContentErrors are allowed to propagate
+        raise
+    except Exception as exc:
+        # Any other errors are treated as a ContentError
+        raise ContentError(f"Invalid content found at {url}") from exc
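
The wrap/unwrap dance above lets fetcher failures travel through probe() and render_index() without being mistaken for content problems, then hands the caller back the original exception. A self-contained sketch of the mechanism (bad_fetcher is a made-up failing fetcher, and the helper is restated here so the snippet runs on its own):

import asyncio


class FetcherError(Exception):
    pass


def with_error_handling(fetcher):
    # same wrapping as in api.py above
    async def new_fetcher(url):
        try:
            return await fetcher(url)
        except Exception as exc:
            raise FetcherError from exc

    return new_fetcher


async def bad_fetcher(url):
    raise IOError("connection refused")


async def demo():
    try:
        await with_error_handling(bad_fetcher)("https://example.com/repodata/repomd.xml")
    except FetcherError as exc:
        # autoindex does `raise exc.__cause__ from None`: the caller gets the
        # original IOError back, with the internal FetcherError suppressed.
        assert isinstance(exc.__cause__, IOError)
        raise exc.__cause__ from None


try:
    asyncio.run(demo())
except IOError as exc:
    print(exc)  # connection refused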
repo_autoindex/_impl/base.py
@@ -15,6 +15,22 @@ ICON_QCOW = "🐮"
 ICON_OTHER = " "
 
 
+class ContentError(Exception):
+    """An error raised when indexed content appears to be invalid.
+
+    Errors of this type are raised when repo-autoindex is able to successfully
+    retrieve content and determine a repository type but fails to parse
+    repository metadata. For example, a corrupt yum repository may cause this
+    error to be raised.
+    """
+
+
+class FetcherError(Exception):
+    # Internal-only error used to separate exceptions raised by fetchers from
+    # exceptions raised by anything else.
+    pass
+
+
 @dataclass
 class GeneratedIndex:
     """A single HTML index page generated by repo-autoindex."""
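
One consequence of the split worth spelling out: a ContentError raised from a parser failure keeps that failure chained as its __cause__, which the new tests below assert on. A small sketch of the chaining:

from repo_autoindex import ContentError

try:
    try:
        raise ValueError("bad XML")
    except ValueError as original:
        raise ContentError("Invalid content found at https://example.com") from original
except ContentError as exc:
    assert isinstance(exc.__cause__, ValueError)  # the parser failure is preserved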
repo_autoindex/_impl/yum.py
@@ -3,30 +3,35 @@ import logging
 import os
 from collections.abc import AsyncGenerator, Generator, Iterable
 from dataclasses import dataclass
-from typing import Optional, Type
+from typing import Optional, Type, Any, TypeVar, NoReturn, overload
 from xml.dom.minidom import Element
-from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream
+from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
 
 from defusedxml import pulldom  # type: ignore
 
-from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo
+from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
 from .template import TemplateContext
 from .tree import treeify
 
 LOG = logging.getLogger("autoindex")
 
 
+def assert_repodata_ok(condition: Any, msg: str):
+    if not condition:
+        raise ContentError(msg)
+
+
 def get_tag(elem: Element, name: str) -> Element:
     elems: list[Element] = elem.getElementsByTagName(name)  # type: ignore
+    assert_repodata_ok(len(elems) == 1, f"expected exactly one {name} tag")
     return elems[0]
 
 
 def get_text_tag(elem: Element, name: str) -> str:
     tagnode = get_tag(elem, name)
     child = tagnode.firstChild
-    # TODO: raise proper error if missing
-    assert child
-    return str(child.toxml())
+    assert_repodata_ok(child, f"missing text {name} tag")
+    return str(child.toxml())  # type: ignore
 
 
 @dataclass
@@ -39,7 +44,6 @@ class Package:
     def from_element(cls, elem: Element) -> "Package":
         return cls(
             href=get_tag(elem, "location").attributes["href"].value,
-            # TODO: tolerate some of these being absent or wrong.
            time=get_tag(elem, "time").attributes["file"].value,
            size=get_tag(elem, "size").attributes["package"].value,
        )
@@ -104,10 +108,9 @@ class YumRepo(Repo):
         )
         if len(revision_nodes) == 1:
             timestamp_node = revision_nodes[0].firstChild
-            # TODO: raise proper error
-            assert timestamp_node
+            assert_repodata_ok(timestamp_node, "missing timestamp node")
             time = datetime.datetime.utcfromtimestamp(
-                int(timestamp_node.toxml())
+                int(timestamp_node.toxml())  # type: ignore
             ).isoformat()
 
             out.append(
@@ -165,11 +168,10 @@ class YumRepo(Repo):
         primary_url = "/".join([self.base_url, href])
         primary_xml = await self.fetcher(primary_url)
 
-        # TODO: raise proper error if missing
-        assert primary_xml
+        assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}")
 
         return sorted(
-            [p.index_entry for p in self.__packages_from_primary(primary_xml)],
+            [p.index_entry for p in self.__packages_from_primary(primary_xml)],  # type: ignore
             key=lambda e: e.text,
         )
 
tests/test_yum_render_corrupt.py
Normal file
132
tests/test_yum_render_corrupt.py
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
from typing import Optional
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
from repo_autoindex import autoindex, ContentError
|
||||||
|
|
||||||
|
REPOMD_XML = textwrap.dedent(
|
||||||
|
"""
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
|
||||||
|
<revision>1657165688</revision>
|
||||||
|
<data type="primary">
|
||||||
|
<checksum type="sha256">d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf</checksum>
|
||||||
|
<open-checksum type="sha256">6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62</open-checksum>
|
||||||
|
<location href="repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>2932</size>
|
||||||
|
<open-size>16585</open-size>
|
||||||
|
</data>
|
||||||
|
<data type="filelists">
|
||||||
|
<checksum type="sha256">284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f</checksum>
|
||||||
|
<open-checksum type="sha256">72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f</open-checksum>
|
||||||
|
<location href="repodata/284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f-filelists.xml.gz"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>4621</size>
|
||||||
|
<open-size>36911</open-size>
|
||||||
|
</data>
|
||||||
|
<data type="other">
|
||||||
|
<checksum type="sha256">36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21</checksum>
|
||||||
|
<open-checksum type="sha256">39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f</open-checksum>
|
||||||
|
<location href="repodata/36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21-other.xml.gz"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>1408</size>
|
||||||
|
<open-size>8432</open-size>
|
||||||
|
</data>
|
||||||
|
<data type="primary_db">
|
||||||
|
<checksum type="sha256">55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21</checksum>
|
||||||
|
<open-checksum type="sha256">b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d</open-checksum>
|
||||||
|
<location href="repodata/55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21-primary.sqlite.bz2"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>7609</size>
|
||||||
|
<open-size>114688</open-size>
|
||||||
|
<database_version>10</database_version>
|
||||||
|
</data>
|
||||||
|
<data type="filelists_db">
|
||||||
|
<checksum type="sha256">de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6</checksum>
|
||||||
|
<open-checksum type="sha256">40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3</open-checksum>
|
||||||
|
<location href="repodata/de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6-filelists.sqlite.bz2"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>10323</size>
|
||||||
|
<open-size>65536</open-size>
|
||||||
|
<database_version>10</database_version>
|
||||||
|
</data>
|
||||||
|
<data type="other_db">
|
||||||
|
<checksum type="sha256">9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c</checksum>
|
||||||
|
<open-checksum type="sha256">3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972</open-checksum>
|
||||||
|
<location href="repodata/9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c-other.sqlite.bz2"/>
|
||||||
|
<timestamp>1657165688</timestamp>
|
||||||
|
<size>2758</size>
|
||||||
|
<open-size>32768</open-size>
|
||||||
|
<database_version>10</database_version>
|
||||||
|
</data>
|
||||||
|
</repomd>
|
||||||
|
"""
|
||||||
|
).strip()
|
||||||
|
|
||||||
|
PRIMARY_XML = textwrap.dedent(
|
||||||
|
"""
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<metadata xmlns="http://linux.duke.edu/metadata/common" xmlns:rpm="http://linux.duke.edu/metadata/rpm" packages="5">
|
||||||
|
<package type="rpm">
|
||||||
|
<name>
|
||||||
|
"""
|
||||||
|
).strip()
|
||||||
|
|
||||||
|
|
||||||
|
class StaticFetcher:
|
||||||
|
def __init__(self):
|
||||||
|
self.content: dict[str, str] = {}
|
||||||
|
|
||||||
|
async def __call__(self, url: str) -> Optional[str]:
|
||||||
|
return self.content.get(url)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_corrupt_repodata():
|
||||||
|
fetcher = StaticFetcher()
|
||||||
|
|
||||||
|
fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
|
||||||
|
fetcher.content[
|
||||||
|
"https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
|
||||||
|
] = PRIMARY_XML
|
||||||
|
|
||||||
|
error = None
|
||||||
|
try:
|
||||||
|
async for _ in autoindex("https://example.com", fetcher=fetcher):
|
||||||
|
pass
|
||||||
|
except ContentError as exc:
|
||||||
|
error = exc
|
||||||
|
|
||||||
|
# It should have raised a ContentError
|
||||||
|
assert error
|
||||||
|
|
||||||
|
# It should summarize
|
||||||
|
assert "Invalid content found at https://example.com" in str(error)
|
||||||
|
|
||||||
|
# We don't want the test to depend on precise details, but it should have
|
||||||
|
# some cause coming from the XML parser
|
||||||
|
assert "xml" in error.__cause__.__module__
|
||||||
|
|
||||||
|
|
||||||
|
async def test_missing_primary():
|
||||||
|
fetcher = StaticFetcher()
|
||||||
|
|
||||||
|
fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
|
||||||
|
|
||||||
|
error = None
|
||||||
|
try:
|
||||||
|
async for _ in autoindex("https://example.com", fetcher=fetcher):
|
||||||
|
pass
|
||||||
|
except ContentError as exc:
|
||||||
|
error = exc
|
||||||
|
|
||||||
|
# It should have raised a ContentError
|
||||||
|
assert error
|
||||||
|
|
||||||
|
# It should state the reason
|
||||||
|
assert (
|
||||||
|
"missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
|
||||||
|
in str(error)
|
||||||
|
)
|
||||||
|
|
||||||
|
# This one doesn't have a separate cause as it was raised explicitly by our code
|
||||||
|
assert not error.__cause__
|
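
As an aside, the collect-then-assert pattern in these tests could also be written with pytest.raises; a sketch reusing the file's StaticFetcher and fixtures, and assuming an async-capable pytest setup (e.g. pytest-asyncio):

import pytest

from repo_autoindex import autoindex, ContentError


async def test_corrupt_repodata_compact():
    fetcher = StaticFetcher()  # prepared with REPOMD_XML/PRIMARY_XML as above
    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
    fetcher.content[
        "https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
    ] = PRIMARY_XML

    with pytest.raises(ContentError) as excinfo:
        async for _ in autoindex("https://example.com", fetcher=fetcher):
            pass
    assert "Invalid content found at https://example.com" in str(excinfo.value)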