From 293f5887b7baa1a2d1f48e9e102eebbf893e6709 Mon Sep 17 00:00:00 2001
From: Rohan McGovern <rmcgover@redhat.com>
Date: Tue, 9 Aug 2022 08:49:14 +1000
Subject: [PATCH] Implement error handling

Ultimately, all errors are propagated in some way, but it's important to
differentiate between "the content was invalid" vs "failed to fetch the
content".
---
 repo_autoindex/__init__.py       |   7 +-
 repo_autoindex/_impl/api.py      |  47 +++++++++--
 repo_autoindex/_impl/base.py     |  16 ++++
 repo_autoindex/_impl/yum.py      |  28 ++++---
 tests/test_yum_render_corrupt.py | 132 +++++++++++++++++++++++++++++++
 5 files changed, 209 insertions(+), 21 deletions(-)
 create mode 100644 tests/test_yum_render_corrupt.py

diff --git a/repo_autoindex/__init__.py b/repo_autoindex/__init__.py
index b472b0b..77fcc0c 100644
--- a/repo_autoindex/__init__.py
+++ b/repo_autoindex/__init__.py
@@ -1,4 +1,7 @@
 from ._impl.api import autoindex
-from ._impl.base import Fetcher, GeneratedIndex
+from ._impl.base import Fetcher, GeneratedIndex, ContentError
 
-__all__ = ["autoindex", "Fetcher", "GeneratedIndex"]
+ContentError.__module__ = "repo_autoindex"
+
+
+__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"]
diff --git a/repo_autoindex/_impl/api.py b/repo_autoindex/_impl/api.py
index 12d8df2..b094190 100644
--- a/repo_autoindex/_impl/api.py
+++ b/repo_autoindex/_impl/api.py
@@ -5,7 +5,7 @@ from typing import Optional, Type
 
 import aiohttp
 
-from .base import Fetcher, GeneratedIndex, Repo
+from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError
 from .yum import YumRepo
 from .pulp import PulpFileRepo
 
@@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
     return get_content_with_session
 
 
+def with_error_handling(fetcher: Fetcher) -> Fetcher:
+    # wraps a fetcher such that any raised exceptions are wrapped into FetcherError
+    async def new_fetcher(url: str) -> Optional[str]:
+        try:
+            return await fetcher(url)
+        except Exception as exc:
+            raise FetcherError from exc
+
+    return new_fetcher
+
+
 async def autoindex(
     url: str,
     *,
@@ -91,6 +102,15 @@ async def autoindex(
 
         Zero indexes may be produced if the given URL doesn't represent a repository
         of any supported type.
+
+    Raises:
+        :class:`ContentError`
+            Raised if indexed content appears to be invalid (for example, a yum repository
+            has invalid repodata).
+
+        :class:`Exception`
+            Any exception raised by ``fetcher`` will propagate (for example, I/O errors or
+            HTTP request failures).
     """
     if fetcher is None:
         async with aiohttp.ClientSession() as session:
@@ -103,8 +123,23 @@ async def autoindex(
     while url.endswith("/"):
         url = url[:-1]
 
-    for repo_type in REPO_TYPES:
-        repo = await repo_type.probe(fetcher, url)
-        if repo:
-            async for page in repo.render_index(index_href_suffix=index_href_suffix):
-                yield page
+    fetcher = with_error_handling(fetcher)
+
+    try:
+        for repo_type in REPO_TYPES:
+            repo = await repo_type.probe(fetcher, url)
+            if repo:
+                async for page in repo.render_index(
+                    index_href_suffix=index_href_suffix
+                ):
+                    yield page
+    except FetcherError as exc:
+        # FetcherErrors are unwrapped to propagate whatever was the original error
+        assert exc.__cause__
+        raise exc.__cause__ from None
+    except ContentError:
+        # explicitly raised ContentErrors are allowed to propagate
+        raise
+    except Exception as exc:
+        # Any other errors are treated as a ContentError
+        raise ContentError(f"Invalid content found at {url}") from exc
diff --git a/repo_autoindex/_impl/base.py b/repo_autoindex/_impl/base.py
index a7af5e9..e9954a7 100644
--- a/repo_autoindex/_impl/base.py
+++ b/repo_autoindex/_impl/base.py
@@ -15,6 +15,22 @@ ICON_QCOW = "🐮"
 ICON_OTHER = "  "
 
 
+class ContentError(Exception):
+    """An error raised when indexed content appears to be invalid.
+
+    Errors of this type are raised when repo-autoindex is able to successfully
+    retrieve content and determine a repository type but fails to parse
+    repository metadata. For example, a corrupt yum repository may cause this
+    error to be raised.
+    """
+
+
+class FetcherError(Exception):
+    # Internal-only error used to separate exceptions raised by fetchers from
+    # exceptions raised by anything else.
+    pass
+
+
 @dataclass
 class GeneratedIndex:
     """A single HTML index page generated by repo-autoindex."""
diff --git a/repo_autoindex/_impl/yum.py b/repo_autoindex/_impl/yum.py
index 3d3366f..c46ccfa 100644
--- a/repo_autoindex/_impl/yum.py
+++ b/repo_autoindex/_impl/yum.py
@@ -3,30 +3,35 @@ import logging
 import os
 from collections.abc import AsyncGenerator, Generator, Iterable
 from dataclasses import dataclass
-from typing import Optional, Type
+from typing import Optional, Type, Any, TypeVar, NoReturn, overload
 from xml.dom.minidom import Element
-from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream
+from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
 
 from defusedxml import pulldom  # type: ignore
 
-from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo
+from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
 from .template import TemplateContext
 from .tree import treeify
 
 LOG = logging.getLogger("autoindex")
 
 
+def assert_repodata_ok(condition: Any, msg: str):
+    if not condition:
+        raise ContentError(msg)
+
+
 def get_tag(elem: Element, name: str) -> Element:
     elems: list[Element] = elem.getElementsByTagName(name)  # type: ignore
+    assert_repodata_ok(len(elems) == 1, f"expected exactly one {name} tag")
     return elems[0]
 
 
 def get_text_tag(elem: Element, name: str) -> str:
     tagnode = get_tag(elem, name)
     child = tagnode.firstChild
-    # TODO: raise proper error if missing
-    assert child
-    return str(child.toxml())
+    assert_repodata_ok(child, f"missing text {name} tag")
+    return str(child.toxml())  # type: ignore
 
 
 @dataclass
@@ -39,7 +44,6 @@ class Package:
     def from_element(cls, elem: Element) -> "Package":
         return cls(
             href=get_tag(elem, "location").attributes["href"].value,
-            # TODO: tolerate some of these being absent or wrong.
             time=get_tag(elem, "time").attributes["file"].value,
             size=get_tag(elem, "size").attributes["package"].value,
         )
@@ -104,10 +108,9 @@ class YumRepo(Repo):
         )
         if len(revision_nodes) == 1:
             timestamp_node = revision_nodes[0].firstChild
-            # TODO: raise proper error
-            assert timestamp_node
+            assert_repodata_ok(timestamp_node, "missing timestamp node")
             time = datetime.datetime.utcfromtimestamp(
-                int(timestamp_node.toxml())
+                int(timestamp_node.toxml())  # type: ignore
             ).isoformat()
 
         out.append(
@@ -165,11 +168,10 @@ class YumRepo(Repo):
         primary_url = "/".join([self.base_url, href])
         primary_xml = await self.fetcher(primary_url)
 
-        # TODO: raise proper error if missing
-        assert primary_xml
+        assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}")
 
         return sorted(
-            [p.index_entry for p in self.__packages_from_primary(primary_xml)],
+            [p.index_entry for p in self.__packages_from_primary(primary_xml)],  # type: ignore
             key=lambda e: e.text,
         )
 
diff --git a/tests/test_yum_render_corrupt.py b/tests/test_yum_render_corrupt.py
new file mode 100644
index 0000000..7a7c567
--- /dev/null
+++ b/tests/test_yum_render_corrupt.py
@@ -0,0 +1,132 @@
+from typing import Optional
+import textwrap
+
+from repo_autoindex import autoindex, ContentError
+
+REPOMD_XML = textwrap.dedent(
+    """
+    <?xml version="1.0" encoding="UTF-8"?>
+    <repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
+    <revision>1657165688</revision>
+    <data type="primary">
+        <checksum type="sha256">d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf</checksum>
+        <open-checksum type="sha256">6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62</open-checksum>
+        <location href="repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"/>
+        <timestamp>1657165688</timestamp>
+        <size>2932</size>
+        <open-size>16585</open-size>
+    </data>
+    <data type="filelists">
+        <checksum type="sha256">284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f</checksum>
+        <open-checksum type="sha256">72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f</open-checksum>
+        <location href="repodata/284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f-filelists.xml.gz"/>
+        <timestamp>1657165688</timestamp>
+        <size>4621</size>
+        <open-size>36911</open-size>
+    </data>
+    <data type="other">
+        <checksum type="sha256">36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21</checksum>
+        <open-checksum type="sha256">39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f</open-checksum>
+        <location href="repodata/36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21-other.xml.gz"/>
+        <timestamp>1657165688</timestamp>
+        <size>1408</size>
+        <open-size>8432</open-size>
+    </data>
+    <data type="primary_db">
+        <checksum type="sha256">55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21</checksum>
+        <open-checksum type="sha256">b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d</open-checksum>
+        <location href="repodata/55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21-primary.sqlite.bz2"/>
+        <timestamp>1657165688</timestamp>
+        <size>7609</size>
+        <open-size>114688</open-size>
+        <database_version>10</database_version>
+    </data>
+    <data type="filelists_db">
+        <checksum type="sha256">de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6</checksum>
+        <open-checksum type="sha256">40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3</open-checksum>
+        <location href="repodata/de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6-filelists.sqlite.bz2"/>
+        <timestamp>1657165688</timestamp>
+        <size>10323</size>
+        <open-size>65536</open-size>
+        <database_version>10</database_version>
+    </data>
+    <data type="other_db">
+        <checksum type="sha256">9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c</checksum>
+        <open-checksum type="sha256">3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972</open-checksum>
+        <location href="repodata/9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c-other.sqlite.bz2"/>
+        <timestamp>1657165688</timestamp>
+        <size>2758</size>
+        <open-size>32768</open-size>
+        <database_version>10</database_version>
+    </data>
+    </repomd>
+"""
+).strip()
+
+PRIMARY_XML = textwrap.dedent(
+    """
+<?xml version="1.0" encoding="UTF-8"?>
+<metadata xmlns="http://linux.duke.edu/metadata/common" xmlns:rpm="http://linux.duke.edu/metadata/rpm" packages="5">
+<package type="rpm">
+  <name>
+"""
+).strip()
+
+
+class StaticFetcher:
+    def __init__(self):
+        self.content: dict[str, str] = {}
+
+    async def __call__(self, url: str) -> Optional[str]:
+        return self.content.get(url)
+
+
+async def test_corrupt_repodata():
+    fetcher = StaticFetcher()
+
+    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
+    fetcher.content[
+        "https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
+    ] = PRIMARY_XML
+
+    error = None
+    try:
+        async for _ in autoindex("https://example.com", fetcher=fetcher):
+            pass
+    except ContentError as exc:
+        error = exc
+
+    # It should have raised a ContentError
+    assert error
+
+    # It should summarize
+    assert "Invalid content found at https://example.com" in str(error)
+
+    # We don't want the test to depend on precise details, but it should have
+    # some cause coming from the XML parser
+    assert "xml" in error.__cause__.__module__
+
+
+async def test_missing_primary():
+    fetcher = StaticFetcher()
+
+    fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
+
+    error = None
+    try:
+        async for _ in autoindex("https://example.com", fetcher=fetcher):
+            pass
+    except ContentError as exc:
+        error = exc
+
+    # It should have raised a ContentError
+    assert error
+
+    # It should state the reason
+    assert (
+        "missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
+        in str(error)
+    )
+
+    # This one doesn't have a separate cause as it was raised explicitly by our code
+    assert not error.__cause__