mirror of
https://github.com/release-engineering/repo-autoindex.git
synced 2025-02-23 13:42:52 +00:00
Use SAX instead of pulldom for primary.xml parsing [RHELDST-14338]
Redo the parsing of packages from primary.xml to use SAX; previously it used pulldom. The motivation for the change is to reduce memory usage. When parsing a larger yum repo, such as the one contained within rhel-8-for-ppc64le-appstream-kickstart__8_DOT_4, the observed memory usage of the repo-autoindex command was:

- pulldom: ~700MB
- SAX: ~85MB

This does not affect the output of the indexing process, and is covered by existing tests.
This commit is contained in:
parent
d4377907e2
commit
117cabb0b7
1 changed files with 52 additions and 24 deletions
|
@ -1,13 +1,14 @@
|
|||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import AsyncGenerator, Generator, Iterable
|
||||
from collections.abc import AsyncGenerator, Generator, Iterable, Mapping
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Type, Any, TypeVar, NoReturn, overload
|
||||
from xml.dom.minidom import Element
|
||||
from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
|
||||
from xml.sax.handler import ContentHandler
|
||||
|
||||
from defusedxml import pulldom # type: ignore
|
||||
from defusedxml import pulldom, sax # type: ignore
|
||||
|
||||
from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
|
||||
from .template import TemplateContext
|
||||
|
@ -40,14 +41,6 @@ class Package:
|
|||
time: str
|
||||
size: int
|
||||
|
||||
@classmethod
def from_element(cls, elem: Element) -> "Package":
    """Build a Package from a DOM element.

    The values are taken, in order, from the ``href`` attribute of the
    ``location`` tag, the ``file`` attribute of the ``time`` tag and the
    ``package`` attribute of the ``size`` tag; each tag is looked up
    with ``get_tag``.
    """
    # NOTE(review): the attribute values are passed through unconverted;
    # presumably they are strings -- confirm against get_tag's return type.
    location_href = get_tag(elem, "location").attributes["href"].value
    file_time = get_tag(elem, "time").attributes["file"].value
    package_size = get_tag(elem, "size").attributes["package"].value
    return cls(href=location_href, time=file_time, size=package_size)
|
||||
|
||||
@property
|
||||
def index_entry(self) -> IndexEntry:
|
||||
return IndexEntry(
|
||||
|
@ -81,6 +74,53 @@ def pulldom_elements(
|
|||
current_path.pop()
|
||||
|
||||
|
||||
class PackagesParser(ContentHandler):
    """SAX content handler which loads Package instances from a primary XML.

    SAX is used rather than pulldom because the pulldom memory usage while
    parsing a large primary XML seems unreasonably high.

    Only ``<package type="rpm">`` elements directly under ``<metadata>``
    produce a Package; any other package element is ignored together with
    its children.
    """

    def __init__(self) -> None:
        super().__init__()
        # Names of the elements currently open, outermost first.
        self.current_path: list[str] = []
        # Package being populated, or None while outside of an rpm package.
        self.current_package: Optional[Package] = None
        # Packages completed so far by the current/most recent parse.
        self.packages: list[Package] = []

    def parse(self, xmlstr: str) -> Iterable[Package]:
        """Parse a primary XML document and return the packages found in it.

        Parameters:
            xmlstr: the complete XML document, as text.

        Returns:
            The Package objects collected from the document, in document
            order.
        """
        # Reset all per-document state, not only the result list, so that
        # an earlier parse which aborted midway cannot leak a stale element
        # path or a half-built package into this one.
        self.current_path = []
        self.current_package = None
        self.packages = []

        # Parse the XML document; this will invoke our start/end element
        # handlers which in turn will populate self.packages.
        sax.parseString(xmlstr.encode("utf-8"), self)

        return self.packages

    def startElement(self, name: str, attrs: Mapping[str, Any]) -> None:
        """Track the element path and record package attributes of interest."""
        self.current_path.append(name)
        LOG.debug("entering element %s", self.current_path)

        if self.current_path == ["metadata", "package"]:
            if attrs.get("type") == "rpm":
                self.current_package = Package("<unknown package>", "", 0)
            return

        package = self.current_package
        if package is None:
            # Either outside of any package, or inside a package whose type
            # is not "rpm": there is nothing to populate.
            return

        if self.current_path == ["metadata", "package", "location"]:
            package.href = attrs["href"]
        elif self.current_path == ["metadata", "package", "time"]:
            package.time = attrs["file"]
        elif self.current_path == ["metadata", "package", "size"]:
            # NOTE(review): the attribute value is stored unconverted even
            # though Package.size is annotated as int -- confirm what the
            # consumers of Package.size expect before adding a conversion.
            package.size = attrs["package"]

    def endElement(self, _name: str) -> None:
        """Finish the current package, if any, and leave the element."""
        LOG.debug("leaving element %s", self.current_path)

        if (
            self.current_path == ["metadata", "package"]
            and self.current_package is not None
        ):
            self.packages.append(self.current_package)
            self.current_package = None

        self.current_path.pop()
|
||||
|
||||
|
||||
class YumRepo(Repo):
|
||||
async def render_index(
|
||||
self, index_href_suffix: str
|
||||
|
@ -175,21 +215,9 @@ class YumRepo(Repo):
|
|||
key=lambda e: e.text,
|
||||
)
|
||||
|
||||
def __packages_from_primary(self, primary_xml: str) -> list[Package]:
|
||||
def __packages_from_primary(self, primary_xml: str) -> Iterable[Package]:
|
||||
LOG.debug("primary xml: %s", primary_xml)
|
||||
|
||||
out = []
|
||||
for elem in pulldom_elements(
|
||||
primary_xml,
|
||||
path_matcher=lambda p: p == ["metadata", "package"],
|
||||
attr_matcher=lambda attrs: attrs.get("type")
|
||||
and attrs["type"].value == "rpm",
|
||||
):
|
||||
pkg = Package.from_element(elem)
|
||||
if pkg:
|
||||
out.append(pkg)
|
||||
|
||||
return out
|
||||
return PackagesParser().parse(primary_xml)
|
||||
|
||||
def __render_entries(
|
||||
self,
|
||||
|
|
Loading…
Add table
Reference in a new issue