mirror of
https://github.com/release-engineering/repo-autoindex.git
synced 2025-02-24 06:02:54 +00:00
commit
7284e28078
5 changed files with 209 additions and 21 deletions
|
@ -1,4 +1,7 @@
|
|||
from ._impl.api import autoindex
|
||||
from ._impl.base import Fetcher, GeneratedIndex
|
||||
from ._impl.base import Fetcher, GeneratedIndex, ContentError
|
||||
|
||||
__all__ = ["autoindex", "Fetcher", "GeneratedIndex"]
|
||||
ContentError.__module__ = "repo_autoindex"
|
||||
|
||||
|
||||
__all__ = ["autoindex", "ContentError", "Fetcher", "GeneratedIndex"]
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional, Type
|
|||
|
||||
import aiohttp
|
||||
|
||||
from .base import Fetcher, GeneratedIndex, Repo
|
||||
from .base import Fetcher, GeneratedIndex, Repo, ContentError, FetcherError
|
||||
from .yum import YumRepo
|
||||
from .pulp import PulpFileRepo
|
||||
|
||||
|
@ -40,6 +40,17 @@ def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
|
|||
return get_content_with_session
|
||||
|
||||
|
||||
def with_error_handling(fetcher: Fetcher) -> Fetcher:
|
||||
# wraps a fetcher such that any raised exceptions are wrapped into FetcherError
|
||||
async def new_fetcher(url: str) -> Optional[str]:
|
||||
try:
|
||||
return await fetcher(url)
|
||||
except Exception as exc:
|
||||
raise FetcherError from exc
|
||||
|
||||
return new_fetcher
|
||||
|
||||
|
||||
async def autoindex(
|
||||
url: str,
|
||||
*,
|
||||
|
@ -91,6 +102,15 @@ async def autoindex(
|
|||
|
||||
Zero indexes may be produced if the given URL doesn't represent a repository
|
||||
of any supported type.
|
||||
|
||||
Raises:
|
||||
:class:`ContentError`
|
||||
Raised if indexed content appears to be invalid (for example, a yum repository
|
||||
has invalid repodata).
|
||||
|
||||
:class:`Exception`
|
||||
Any exception raised by ``fetcher`` will propagate (for example, I/O errors or
|
||||
HTTP request failures).
|
||||
"""
|
||||
if fetcher is None:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
|
@ -103,8 +123,23 @@ async def autoindex(
|
|||
while url.endswith("/"):
|
||||
url = url[:-1]
|
||||
|
||||
for repo_type in REPO_TYPES:
|
||||
repo = await repo_type.probe(fetcher, url)
|
||||
if repo:
|
||||
async for page in repo.render_index(index_href_suffix=index_href_suffix):
|
||||
yield page
|
||||
fetcher = with_error_handling(fetcher)
|
||||
|
||||
try:
|
||||
for repo_type in REPO_TYPES:
|
||||
repo = await repo_type.probe(fetcher, url)
|
||||
if repo:
|
||||
async for page in repo.render_index(
|
||||
index_href_suffix=index_href_suffix
|
||||
):
|
||||
yield page
|
||||
except FetcherError as exc:
|
||||
# FetcherErrors are unwrapped to propagate whatever was the original error
|
||||
assert exc.__cause__
|
||||
raise exc.__cause__ from None
|
||||
except ContentError:
|
||||
# explicitly raised ContentErrors are allowed to propagate
|
||||
raise
|
||||
except Exception as exc:
|
||||
# Any other errors are treated as a ContentError
|
||||
raise ContentError(f"Invalid content found at {url}") from exc
|
||||
|
|
|
@ -15,6 +15,22 @@ ICON_QCOW = "🐮"
|
|||
ICON_OTHER = " "
|
||||
|
||||
|
||||
class ContentError(Exception):
|
||||
"""An error raised when indexed content appears to be invalid.
|
||||
|
||||
Errors of this type are raised when repo-autoindex is able to successfully
|
||||
retrieve content and determine a repository type but fails to parse
|
||||
repository metadata. For example, a corrupt yum repository may cause this
|
||||
error to be raised.
|
||||
"""
|
||||
|
||||
|
||||
class FetcherError(Exception):
|
||||
# Internal-only error used to separate exceptions raised by fetchers from
|
||||
# exceptions raised by anything else.
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class GeneratedIndex:
|
||||
"""A single HTML index page generated by repo-autoindex."""
|
||||
|
|
|
@ -3,30 +3,35 @@ import logging
|
|||
import os
|
||||
from collections.abc import AsyncGenerator, Generator, Iterable
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Type
|
||||
from typing import Optional, Type, Any, TypeVar, NoReturn, overload
|
||||
from xml.dom.minidom import Element
|
||||
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT, DOMEventStream
|
||||
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
|
||||
|
||||
from defusedxml import pulldom # type: ignore
|
||||
|
||||
from .base import ICON_FOLDER, ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo
|
||||
from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
|
||||
from .template import TemplateContext
|
||||
from .tree import treeify
|
||||
|
||||
LOG = logging.getLogger("autoindex")
|
||||
|
||||
|
||||
def assert_repodata_ok(condition: Any, msg: str):
|
||||
if not condition:
|
||||
raise ContentError(msg)
|
||||
|
||||
|
||||
def get_tag(elem: Element, name: str) -> Element:
|
||||
elems: list[Element] = elem.getElementsByTagName(name) # type: ignore
|
||||
assert_repodata_ok(len(elems) == 1, f"expected exactly one {name} tag")
|
||||
return elems[0]
|
||||
|
||||
|
||||
def get_text_tag(elem: Element, name: str) -> str:
|
||||
tagnode = get_tag(elem, name)
|
||||
child = tagnode.firstChild
|
||||
# TODO: raise proper error if missing
|
||||
assert child
|
||||
return str(child.toxml())
|
||||
assert_repodata_ok(child, f"missing text {name} tag")
|
||||
return str(child.toxml()) # type: ignore
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -39,7 +44,6 @@ class Package:
|
|||
def from_element(cls, elem: Element) -> "Package":
|
||||
return cls(
|
||||
href=get_tag(elem, "location").attributes["href"].value,
|
||||
# TODO: tolerate some of these being absent or wrong.
|
||||
time=get_tag(elem, "time").attributes["file"].value,
|
||||
size=get_tag(elem, "size").attributes["package"].value,
|
||||
)
|
||||
|
@ -104,10 +108,9 @@ class YumRepo(Repo):
|
|||
)
|
||||
if len(revision_nodes) == 1:
|
||||
timestamp_node = revision_nodes[0].firstChild
|
||||
# TODO: raise proper error
|
||||
assert timestamp_node
|
||||
assert_repodata_ok(timestamp_node, "missing timestamp node")
|
||||
time = datetime.datetime.utcfromtimestamp(
|
||||
int(timestamp_node.toxml())
|
||||
int(timestamp_node.toxml()) # type: ignore
|
||||
).isoformat()
|
||||
|
||||
out.append(
|
||||
|
@ -165,11 +168,10 @@ class YumRepo(Repo):
|
|||
primary_url = "/".join([self.base_url, href])
|
||||
primary_xml = await self.fetcher(primary_url)
|
||||
|
||||
# TODO: raise proper error if missing
|
||||
assert primary_xml
|
||||
assert_repodata_ok(primary_xml, f"missing primary XML at {primary_url}")
|
||||
|
||||
return sorted(
|
||||
[p.index_entry for p in self.__packages_from_primary(primary_xml)],
|
||||
[p.index_entry for p in self.__packages_from_primary(primary_xml)], # type: ignore
|
||||
key=lambda e: e.text,
|
||||
)
|
||||
|
||||
|
|
132
tests/test_yum_render_corrupt.py
Normal file
132
tests/test_yum_render_corrupt.py
Normal file
|
@ -0,0 +1,132 @@
|
|||
from typing import Optional
|
||||
import textwrap
|
||||
|
||||
from repo_autoindex import autoindex, ContentError
|
||||
|
||||
REPOMD_XML = textwrap.dedent(
|
||||
"""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
|
||||
<revision>1657165688</revision>
|
||||
<data type="primary">
|
||||
<checksum type="sha256">d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf</checksum>
|
||||
<open-checksum type="sha256">6fc4eddd4e9de89246efba3815b8a9dec9dfe168e4fd3104cc792dff908a0f62</open-checksum>
|
||||
<location href="repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>2932</size>
|
||||
<open-size>16585</open-size>
|
||||
</data>
|
||||
<data type="filelists">
|
||||
<checksum type="sha256">284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f</checksum>
|
||||
<open-checksum type="sha256">72f89223c8b0f6c7a2ee6ed7fbd16ee0bb395ca68260038bb3895265af84c29f</open-checksum>
|
||||
<location href="repodata/284769ec79daa9e0a3b0129bb6260cc6271c90c4fe02b43dfa7cdf7635fb803f-filelists.xml.gz"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>4621</size>
|
||||
<open-size>36911</open-size>
|
||||
</data>
|
||||
<data type="other">
|
||||
<checksum type="sha256">36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21</checksum>
|
||||
<open-checksum type="sha256">39f52cf295db14e863abcd7b2eede8e6c5e39ac9b2f194349459d29cd492c90f</open-checksum>
|
||||
<location href="repodata/36c2195bbee0c39ee080969abc6fd59d943c3471114cfd43c6e776ac20d7ed21-other.xml.gz"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>1408</size>
|
||||
<open-size>8432</open-size>
|
||||
</data>
|
||||
<data type="primary_db">
|
||||
<checksum type="sha256">55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21</checksum>
|
||||
<open-checksum type="sha256">b2692c49d1d98d68e764e29108d8a81a3dfd9e04fa7665115853a029396d118d</open-checksum>
|
||||
<location href="repodata/55e6bfd00e889c5c1f9a3c9fb35a660158bc5d975ae082d434f3cf81cc2c0c21-primary.sqlite.bz2"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>7609</size>
|
||||
<open-size>114688</open-size>
|
||||
<database_version>10</database_version>
|
||||
</data>
|
||||
<data type="filelists_db">
|
||||
<checksum type="sha256">de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6</checksum>
|
||||
<open-checksum type="sha256">40eb5d53fe547c98d470813256c9bfc8a239b13697d8eb824a1485c9e186a0e3</open-checksum>
|
||||
<location href="repodata/de63a509812c37f7736fcef0b79e9c55dfe67a2d77006f74fdc442935103e9e6-filelists.sqlite.bz2"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>10323</size>
|
||||
<open-size>65536</open-size>
|
||||
<database_version>10</database_version>
|
||||
</data>
|
||||
<data type="other_db">
|
||||
<checksum type="sha256">9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c</checksum>
|
||||
<open-checksum type="sha256">3e5cefb10ce805b827e12ca3b4839bba873dc9403fd92b60a364bf6f312bd972</open-checksum>
|
||||
<location href="repodata/9aa39b62df200cb3784dea24092d0c1c686afff0cd0990c2ec7a61afe8896e1c-other.sqlite.bz2"/>
|
||||
<timestamp>1657165688</timestamp>
|
||||
<size>2758</size>
|
||||
<open-size>32768</open-size>
|
||||
<database_version>10</database_version>
|
||||
</data>
|
||||
</repomd>
|
||||
"""
|
||||
).strip()
|
||||
|
||||
PRIMARY_XML = textwrap.dedent(
|
||||
"""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<metadata xmlns="http://linux.duke.edu/metadata/common" xmlns:rpm="http://linux.duke.edu/metadata/rpm" packages="5">
|
||||
<package type="rpm">
|
||||
<name>
|
||||
"""
|
||||
).strip()
|
||||
|
||||
|
||||
class StaticFetcher:
|
||||
def __init__(self):
|
||||
self.content: dict[str, str] = {}
|
||||
|
||||
async def __call__(self, url: str) -> Optional[str]:
|
||||
return self.content.get(url)
|
||||
|
||||
|
||||
async def test_corrupt_repodata():
|
||||
fetcher = StaticFetcher()
|
||||
|
||||
fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
|
||||
fetcher.content[
|
||||
"https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
|
||||
] = PRIMARY_XML
|
||||
|
||||
error = None
|
||||
try:
|
||||
async for _ in autoindex("https://example.com", fetcher=fetcher):
|
||||
pass
|
||||
except ContentError as exc:
|
||||
error = exc
|
||||
|
||||
# It should have raised a ContentError
|
||||
assert error
|
||||
|
||||
# It should summarize
|
||||
assert "Invalid content found at https://example.com" in str(error)
|
||||
|
||||
# We don't want the test to depend on precise details, but it should have
|
||||
# some cause coming from the XML parser
|
||||
assert "xml" in error.__cause__.__module__
|
||||
|
||||
|
||||
async def test_missing_primary():
|
||||
fetcher = StaticFetcher()
|
||||
|
||||
fetcher.content["https://example.com/repodata/repomd.xml"] = REPOMD_XML
|
||||
|
||||
error = None
|
||||
try:
|
||||
async for _ in autoindex("https://example.com", fetcher=fetcher):
|
||||
pass
|
||||
except ContentError as exc:
|
||||
error = exc
|
||||
|
||||
# It should have raised a ContentError
|
||||
assert error
|
||||
|
||||
# It should state the reason
|
||||
assert (
|
||||
"missing primary XML at https://example.com/repodata/d4888f04f95ac067af4d997d35c6d345cbe398563d777d017a3634c9ed6148cf-primary.xml.gz"
|
||||
in str(error)
|
||||
)
|
||||
|
||||
# This one doesn't have a separate cause as it was raised explicitly by our code
|
||||
assert not error.__cause__
|
Loading…
Add table
Reference in a new issue