import gzip
import logging
from collections.abc import AsyncGenerator, Awaitable, Callable
from typing import Optional, Type, BinaryIO
import tempfile
import io

import aiohttp

from .base import Fetcher, IOFetcher, GeneratedIndex, Repo, ContentError, FetcherError
from .yum import YumRepo
from .pulp import PulpFileRepo
from .kickstart import KickstartRepo

LOG = logging.getLogger("repo-autoindex")

REPO_TYPES: list[Type[Repo]] = [KickstartRepo, YumRepo, PulpFileRepo]


def http_fetcher(session: aiohttp.ClientSession) -> Fetcher:
    async def get_content_with_session(
        url: str,
    ) -> Optional[BinaryIO]:
        LOG.info("Fetching: %s", url)
        async with session.get(url) as resp:
            if resp.status == 404:
                # This error status means we successfully determined that
                # no content exists
                return None

            # Any other error status is fatal
            resp.raise_for_status()

            out: BinaryIO = tempfile.NamedTemporaryFile(prefix="repo-autoindex")  # type: ignore
            async for chunk in resp.content:
                out.write(chunk)
            out.flush()
            out.seek(0)

            # Deal with the non-ideal content negotiation
            # for certain storage backends.
            if url.endswith(".gz") and resp.content_type in (
                "application/gzip",
                "application/x-gzip",
                "application/octet-stream",
            ):
                out = gzip.GzipFile(fileobj=out)  # type: ignore

            return out

    return get_content_with_session

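# Illustrative usage sketch for http_fetcher, from within a coroutine
# (the URL below is hypothetical):
#
#     async with aiohttp.ClientSession() as session:
#         fetch = http_fetcher(session)
#         # None for a 404; otherwise a binary stream over the response body,
#         # transparently gunzipped when a ".gz" URL is served as gzip or
#         # octet-stream content.
#         repomd = await fetch("https://example.com/repo/repodata/repomd.xml")

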
def wrapped_fetcher(fetcher: Fetcher) -> IOFetcher:
    # Wraps a fetcher, as passed in by the caller, into an internal
    # fetcher enforcing certain behaviors:
    #
    # - wraps all exceptions in FetcherError
    #
    # - adapts 'str' outputs into io streams
    #
    async def new_fetcher(url: str) -> Optional[BinaryIO]:
        try:
            out = await fetcher(url)
            if isinstance(out, str):
                out = io.BytesIO(out.encode())
            return out
        except Exception as exc:
            raise FetcherError from exc

    return new_fetcher

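# Illustrative sketch of the behaviors enforced by wrapped_fetcher, from
# within a coroutine (the fetcher below is hypothetical, for demonstration
# only):
#
#     async def str_fetcher(url: str) -> Optional[str]:
#         return "<repomd/>"
#
#     fetch = wrapped_fetcher(str_fetcher)
#     out = await fetch(url)  # arrives adapted as io.BytesIO(b"<repomd/>")
#     # ...and any exception raised by str_fetcher surfaces as FetcherError.

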
async def autoindex(
    url: str,
    *,
    fetcher: Optional[Fetcher] = None,
    index_href_suffix: str = "",
) -> AsyncGenerator[GeneratedIndex, None]:
"""Generate HTML indexes for a repository.
|
|
|
|
|
|
|
|
Arguments:
|
|
|
|
url
|
|
|
|
Base URL of repository to be indexed. The function will probe this URL
|
|
|
|
for all supported repository types.
|
|
|
|
|
|
|
|
fetcher
|
|
|
|
An optional callable to customize the retrieval method for content in the
|
|
|
|
repository. Can be omitted to use a basic HTTP(S) fetcher.
|
|
|
|
|
|
|
|
A valid implementation must satisfy this contract:
|
|
|
|
|
|
|
|
- it will be called with the absolute URL of content which may or may not exist
|
|
|
|
within the repository (e.g.
|
|
|
|
"https://example.com/some-yum-repo/repodata/repomd.xml" when probing a yum
|
|
|
|
repository)
|
|
|
|
|
|
|
|
- if the fetcher can determine, without error, that the requested content does not
|
|
|
|
exist: it must return ``None``.
|
|
|
|
|
2023-09-21 10:24:13 +10:00
|
|
|
- if the fetcher can retrieve the requested content, it must return the
|
|
|
|
content at the given URL as a file-like object.
|
|
|
|
|
|
|
|
Returning a ``str`` is also possible, but not recommended since it
|
|
|
|
requires loading an entire file into memory at once, and some
|
|
|
|
repositories contain very large files.
|
|
|
|
|
|
|
|
Note that decompressing compressed files (such as bzipped XML in
|
|
|
|
yum repositories) is the responsibility of the fetcher.
|
2022-08-08 09:50:52 +10:00
|
|
|
|
|
|
|
- if the fetcher encounters an exception, it may allow the exception to
|
|
|
|
propagate.
|
|
|
|
|
|
|
|
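
            For example, a fetcher reading content from the local filesystem
            might look like the following sketch (illustrative only, not part
            of this library; it assumes plain, uncompressed files):

            .. code-block:: python

                import os
                from typing import BinaryIO, Optional

                async def file_fetcher(url: str) -> Optional[BinaryIO]:
                    path = url.removeprefix("file://")
                    if not os.path.exists(path):
                        # Confirmed absent: return None per the contract above.
                        return None
                    return open(path, "rb")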

        index_href_suffix
            Suffix added onto any links between one generated index and another.

            For example, if the caller intends to save each generated index page as
            autoindex.html, then ``index_href_suffix="autoindex.html"`` should be passed
            so that any links between one index and another will use a correct URL.

            On the other hand, if the caller intends to save each generated index page
            as index.html and serve them via a web server which automatically serves
            files named index.html within each directory, the suffix can be left
            blank.

    Returns:
        An async generator producing zero or more instances of :class:`GeneratedIndex`.

        Zero indexes may be produced if the given URL doesn't represent a repository
        of any supported type.

    Raises:
        :class:`ContentError`
            Raised if indexed content appears to be invalid (for example, a yum
            repository has invalid repodata).

        :class:`Exception`
            Any exception raised by ``fetcher`` will propagate (for example, I/O
            errors or HTTP request failures).
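
    Example:
        A minimal invocation using the default HTTP(S) fetcher (the URL shown
        is hypothetical):

        .. code-block:: python

            async def dump_indexes():
                async for page in autoindex("https://example.com/some-yum-repo"):
                    ...  # each page is a GeneratedIndex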
    """
    if fetcher is None:
        async with aiohttp.ClientSession() as session:
            async for page in autoindex(
                url, fetcher=http_fetcher(session), index_href_suffix=index_href_suffix
            ):
                yield page
        return

    while url.endswith("/"):
        url = url[:-1]

    fetcher = wrapped_fetcher(fetcher)

    try:
        for repo_type in REPO_TYPES:
            repo = await repo_type.probe(fetcher, url)
            if repo:
                async for page in repo.render_index(
                    index_href_suffix=index_href_suffix
                ):
                    yield page
                break
    except FetcherError as exc:
        # FetcherErrors are unwrapped to propagate whatever was the original error
        assert exc.__cause__
        raise exc.__cause__ from None
    except ContentError:
        # explicitly raised ContentErrors are allowed to propagate
        raise
    except Exception as exc:
        # Any other errors are treated as a ContentError
        raise ContentError(f"Invalid content found at {url}") from exc