mirror of
https://github.com/release-engineering/repo-autoindex.git
synced 2025-02-23 13:42:52 +00:00

The Fetcher type was designed to return a 'str'. That wasn't a good idea because it implies that every fetched file must be loaded into memory completely. On certain large yum repos, decompressed primary XML can be hundreds of MB, and it's not appropriate to require loading that all into memory at once. Make it support a file-like object (stream of bytes). Since the SAX XML parser supports reading from a stream, this makes it possible to avoid loading everything into memory at once. A test of repo-autoindex CLI against /content/dist/rhel/server/7/7Server/x86_64/os showed major improvement: - before: ~1200MiB - after: ~80MiB Note that achieving the full improvement requires any downstream users of the library (e.g. exodus-gw) to update their Fetcher implementation as well, to stop returning a 'str'.
67 lines
1.6 KiB
Python
67 lines
1.6 KiB
Python
import gzip
|
|
import pytest
|
|
from aiohttp import web
|
|
from repo_autoindex._impl.api import http_fetcher
|
|
|
|
|
|
class FakeReader:
    """Minimal stand-in for an aiohttp response body stream.

    Supports the two access styles the code under test may use:
    async iteration (yielding the whole body as a single chunk,
    exactly once) and a one-shot ``read()`` coroutine.
    """

    def __init__(self, body: bytes):
        self.body = body
        # Set by __aiter__ and cleared after the first __anext__ call,
        # so iteration produces exactly one chunk per pass.
        self.iterating = False

    def __aiter__(self):
        # Arm the one-shot iteration flag and act as our own iterator.
        self.iterating = True
        return self

    async def __anext__(self):
        if self.iterating:
            # First (and only) chunk: hand back the full body.
            self.iterating = False
            return self.body
        raise StopAsyncIteration

    async def read(self):
        # Whole-body read, mirroring aiohttp's StreamReader.read().
        return self.body
|
|
|
|
|
|
class FakeResponse:
    """Fake of an aiohttp client response.

    Usable as an async context manager; always reports HTTP 200 and
    never raises from raise_for_status(). The body is exposed as a
    FakeReader stream via the ``content`` property, matching the
    aiohttp response interface consumed by http_fetcher.
    """

    def __init__(self, body: bytes, content_type: str):
        self.body = body
        self.content_type = content_type
        # Fixed success status; raise_for_status is therefore a no-op.
        self.status = 200

    @property
    def content(self):
        # Fresh reader per access, like aiohttp's streaming content.
        return FakeReader(self.body)

    def raise_for_status(self):
        # Status is always 200, so there is never anything to raise.
        pass

    async def __aenter__(self):
        return self

    async def __aexit__(self, *_):
        # Nothing to clean up for an in-memory fake.
        pass
|
|
|
|
|
|
class FakeSession:
|
|
def __init__(self, body: bytes, content_type: str):
|
|
self.body = body
|
|
self.content_type = content_type
|
|
|
|
def get(self, url: str) -> FakeResponse:
|
|
return FakeResponse(self.body, self.content_type)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "content_type", ["application/x-gzip", "application/octet-stream"]
)
async def test_http_fetcher_decompresses(content_type: str):
    """Responses with gzip-like content types are transparently decompressed."""
    expected = "some text"

    # Serve a gzip-compressed body under the parametrized content type.
    session = FakeSession(
        body=gzip.compress(expected.encode("utf-8")),
        content_type=content_type,
    )
    fetcher = http_fetcher(session)

    response = await fetcher("/some/path.gz")

    # The fetcher returns a file-like object; reading it must yield the
    # original uncompressed text.
    assert response.read().decode() == expected
|