Use SAX instead of pulldom for primary.xml parsing [RHELDST-14338]

Redo the parsing of packages from primary.xml to use SAX; previously it was using pulldom.

The motivation for the change is to reduce memory usage. When parsing a larger yum repo such as that contained within rhel-8-for-ppc64le-appstream-kickstart__8_DOT_4, the observed memory usage of the repo-autoindex command was:

- pulldom: ~700MB
- SAX: ~85MB

This change does not affect the output of the indexing process, and is covered by existing tests.
2025-02-23 13:42:52 +00:00 · 2022-10-20 09:22:07 +10:00 · 2022-10-20 09:22:07 +10:00 · 117cabb0b7
commit 117cabb0b7
parent d4377907e2
1 changed files with 52 additions and 24 deletions
--- a/repo_autoindex/_impl/yum.py
+++ b/repo_autoindex/_impl/yum.py
@ -1,13 +1,14 @@
 import datetime
 import logging
 import os
-from collections.abc import AsyncGenerator, Generator, Iterable
+from collections.abc import AsyncGenerator, Generator, Iterable, Mapping
 from dataclasses import dataclass
 from typing import Optional, Type, Any, TypeVar, NoReturn, overload
 from xml.dom.minidom import Element
 from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
+from xml.sax.handler import ContentHandler

-from defusedxml import pulldom  # type: ignore
+from defusedxml import pulldom, sax  # type: ignore

 from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
 from .template import TemplateContext
@ -40,14 +41,6 @@ class Package:
    time: str
    size: int

-    @classmethod
-    def from_element(cls, elem: Element) -> "Package":
-        return cls(
-            href=get_tag(elem, "location").attributes["href"].value,
-            time=get_tag(elem, "time").attributes["file"].value,
-            size=get_tag(elem, "size").attributes["package"].value,
-        )
-
    @property
    def index_entry(self) -> IndexEntry:
        return IndexEntry(
@ -81,6 +74,53 @@ def pulldom_elements(
            current_path.pop()


+class PackagesParser(ContentHandler):
+    # SAX-integrated parser to load Package instances from a primary XML.
+    #
+    # We use this rather than pulldom because the pulldom memory usage while
+    # parsing a large primary XML seems unreasonably high.
+    #
+    def __init__(self) -> None:
+        self.current_path: list[str] = []
+        self.current_package: Optional[Package] = None
+        self.packages: list[Package] = []
+
+    def parse(self, xmlstr: str) -> Iterable[Package]:
+        self.packages = []
+
+        # Parse the XML document; this will invoke our start/end element handlers
+        # which in turn will populate self.packages
+        sax.parseString(xmlstr.encode("utf-8"), self)
+
+        return self.packages
+
+    def startElement(self, name: str, attrs: Mapping[str, Any]):
+        self.current_path.append(name)
+        LOG.debug("entering element %s", self.current_path)
+
+        if self.current_path == ["metadata", "package"] and attrs.get("type") == "rpm":
+            self.current_package = Package("<unknown package>", "", 0)
+        elif self.current_path == ["metadata", "package", "location"]:
+            assert self.current_package
+            self.current_package.href = attrs["href"]
+        elif self.current_path == ["metadata", "package", "time"]:
+            assert self.current_package
+            self.current_package.time = attrs["file"]
+        elif self.current_path == ["metadata", "package", "size"]:
+            assert self.current_package
+            self.current_package.size = attrs["package"]
+
+    def endElement(self, _name: str):
+        LOG.debug("leaving element %s", self.current_path)
+
+        if self.current_path == ["metadata", "package"]:
+            assert self.current_package
+            self.packages.append(self.current_package)
+            self.current_package = None
+
+        self.current_path.pop()
+
+
 class YumRepo(Repo):
    async def render_index(
        self, index_href_suffix: str
@ -175,21 +215,9 @@ class YumRepo(Repo):
            key=lambda e: e.text,
        )

-    def __packages_from_primary(self, primary_xml: str) -> list[Package]:
+    def __packages_from_primary(self, primary_xml: str) -> Iterable[Package]:
        LOG.debug("primary xml: %s", primary_xml)
-
-        out = []
-        for elem in pulldom_elements(
-            primary_xml,
-            path_matcher=lambda p: p == ["metadata", "package"],
-            attr_matcher=lambda attrs: attrs.get("type")
-            and attrs["type"].value == "rpm",
-        ):
-            pkg = Package.from_element(elem)
-            if pkg:
-                out.append(pkg)
-
-        return out
+        return PackagesParser().parse(primary_xml)

    def __render_entries(
        self,