From 117cabb0b713a267c27c3a678e9957a18bc5e916 Mon Sep 17 00:00:00 2001
From: Rohan McGovern <rmcgover@redhat.com>
Date: Thu, 20 Oct 2022 09:22:07 +1000
Subject: [PATCH] Use SAX instead of pulldom for primary.xml parsing
 [RHELDST-14338]

Redo the parsing of packages from primary.xml to use SAX; previously it
was using pulldom. The motivation for the change is to reduce memory usage.

When parsing a larger yum repo such as that contained within
rhel-8-for-ppc64le-appstream-kickstart__8_DOT_4, the observed memory
usage from repo-autoindex command was:

- pulldom: ~700MB
- SAX:     ~85MB

This does not affect the output of the indexing process, and is covered
by existing tests.
---
 repo_autoindex/_impl/yum.py | 76 +++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 24 deletions(-)
diff --git a/repo_autoindex/_impl/yum.py b/repo_autoindex/_impl/yum.py
index 84b4fcc..5d1a9b7 100644
--- a/repo_autoindex/_impl/yum.py
+++ b/repo_autoindex/_impl/yum.py
@@ -1,13 +1,14 @@
 import datetime
 import logging
 import os
-from collections.abc import AsyncGenerator, Generator, Iterable
+from collections.abc import AsyncGenerator, Generator, Iterable, Mapping
 from dataclasses import dataclass
 from typing import Optional, Type, Any, TypeVar, NoReturn, overload
 from xml.dom.minidom import Element
 from xml.dom.pulldom import END_ELEMENT, START_ELEMENT
+from xml.sax.handler import ContentHandler
 
-from defusedxml import pulldom  # type: ignore
+from defusedxml import pulldom, sax  # type: ignore
 
 from .base import ICON_PACKAGE, Fetcher, GeneratedIndex, IndexEntry, Repo, ContentError
 from .template import TemplateContext
@@ -40,14 +41,6 @@ class Package:
     time: str
     size: int
 
-    @classmethod
-    def from_element(cls, elem: Element) -> "Package":
-        return cls(
-            href=get_tag(elem, "location").attributes["href"].value,
-            time=get_tag(elem, "time").attributes["file"].value,
-            size=get_tag(elem, "size").attributes["package"].value,
-        )
-
     @property
     def index_entry(self) -> IndexEntry:
         return IndexEntry(
@@ -81,6 +74,53 @@ def pulldom_elements(
             current_path.pop()
 
 
+class PackagesParser(ContentHandler):
+    # SAX-integrated parser to load Package instances from a primary XML.
+    #
+    # We use this rather than pulldom because the pulldom memory usage while
+    # parsing a large primary XML seems unreasonably high.
+    #
+    def __init__(self) -> None:
+        self.current_path: list[str] = []
+        self.current_package: Optional[Package] = None
+        self.packages: list[Package] = []
+
+    def parse(self, xmlstr: str) -> Iterable[Package]:
+        self.packages = []
+
+        # Parse the XML document; this will invoke our start/end element handlers
+        # which in turn will populate self.packages
+        sax.parseString(xmlstr.encode("utf-8"), self)
+
+        return self.packages
+
+    def startElement(self, name: str, attrs: Mapping[str, Any]):
+        self.current_path.append(name)
+        LOG.debug("entering element %s", self.current_path)
+
+        if self.current_path == ["metadata", "package"] and attrs.get("type") == "rpm":
+            self.current_package = Package("<unknown package>", "", 0)
+        elif self.current_path == ["metadata", "package", "location"]:
+            assert self.current_package
+            self.current_package.href = attrs["href"]
+        elif self.current_path == ["metadata", "package", "time"]:
+            assert self.current_package
+            self.current_package.time = attrs["file"]
+        elif self.current_path == ["metadata", "package", "size"]:
+            assert self.current_package
+            self.current_package.size = attrs["package"]
+
+    def endElement(self, _name: str):
+        LOG.debug("leaving element %s", self.current_path)
+
+        if self.current_path == ["metadata", "package"]:
+            assert self.current_package
+            self.packages.append(self.current_package)
+            self.current_package = None
+
+        self.current_path.pop()
+
+
 class YumRepo(Repo):
     async def render_index(
         self, index_href_suffix: str
@@ -175,21 +215,9 @@ class YumRepo(Repo):
             key=lambda e: e.text,
         )
 
-    def __packages_from_primary(self, primary_xml: str) -> list[Package]:
+    def __packages_from_primary(self, primary_xml: str) -> Iterable[Package]:
         LOG.debug("primary xml: %s", primary_xml)
-
-        out = []
-        for elem in pulldom_elements(
-            primary_xml,
-            path_matcher=lambda p: p == ["metadata", "package"],
-            attr_matcher=lambda attrs: attrs.get("type")
-            and attrs["type"].value == "rpm",
-        ):
-            pkg = Package.from_element(elem)
-            if pkg:
-                out.append(pkg)
-
-        return out
+        return PackagesParser().parse(primary_xml)
 
     def __render_entries(
         self,