Fix tarball timestamps from git archive with Python.

Tito passes "git archive" a tree ID.  The "git archive" man page states:

    git archive behaves differently when given a tree ID versus when
    given a commit ID or tag ID. In the first case the current time
    is used as the modification time of each file in the archive.

Using the current time means that every time we build the source
tarball, the file fingerprint will change since the metadata in the
tarball changes.  We don't want that since build systems track the
fingerprint to see if the actual source has changed.

This process was previously handled in an enigmatic Perl script that
lacked any comments whatsoever.  Converting it to well-commented Python
makes the process less mysterious and speedier since Tito doesn't need
to shell out to Perl.
Alex Wood 2015-04-23 18:20:28 -04:00
parent 35baac73e0
commit 09d89eb4ef
8 changed files with 385 additions and 139 deletions
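
For orientation before the diff: the core of the fix, in both the retired Perl script and the new TarFixer class added below, is to overwrite the 12-byte mtime field at offset 136 of every 512-byte ustar header and then recompute the 8-byte checksum field at offset 148, treating the checksum bytes as spaces while summing. A minimal standalone sketch of that operation follows; the helper name and framing are illustrative, not code from this commit:

RECORD_SIZE = 512

def patch_header_mtime(header, timestamp):
    """Return a copy of one 512-byte ustar header with its mtime replaced
    and its checksum recomputed (illustrative helper, not part of tito)."""
    assert len(header) == RECORD_SIZE
    # mtime is a 12-byte, zero-padded octal field at offset 136
    header = header[:136] + ("%011o\x00" % timestamp) + header[148:]
    # The checksum is the simple sum of all header bytes with the 8-byte
    # checksum field (offsets 148-155) treated as spaces.
    summable = header[:148] + " " * 8 + header[156:]
    checksum = sum(ord(c) for c in summable)
    # Store it the same way the code below does: 7 octal digits plus a NUL
    return header[:148] + ("%07o\x00" % checksum) + header[156:]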

bin/tar-fixup-stamp-comment.pl (deleted file, 127 lines)

@@ -1,127 +0,0 @@
#!/usr/bin/perl
#
# Copyright (c) 2008-2009 Red Hat, Inc.
#
# This software is licensed to you under the GNU General Public License,
# version 2 (GPLv2). There is NO WARRANTY for this software, express or
# implied, including the implied warranties of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
# along with this software; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
#
# Red Hat trademarks are not licensed under GPLv2. No permission is
# granted to use or replicate Red Hat trademarks that are incorporated
# in this software or its documentation.
use strict;
use warnings FATAL => 'all';

use IO::Handle ();

use constant RECORD_SIZE => 512;
use constant GIT_BLOCK_SIZE => RECORD_SIZE * 20;

my $stamp = shift;
if (not defined $stamp) {
    die "Please specify stamp to put into the tar as the first parameter.\n";
}
my $stamp_octal = sprintf "%011o", $stamp;

my $comment = shift;
if (defined $comment) {
    if (not $comment =~ /^[0-9a-f]{40}$/) {
        die "The comment we will put into the tar should be SHA1 in hex (40 characters).\n";
    }
}

my $chunk;
my $handle = \*STDIN;
my $read;
my $need_header = 1;
my $total_len = 0;
while ($read = $handle->sysread($chunk, RECORD_SIZE)) {
    # print STDERR "read [$read]\n";
    if ($read < RECORD_SIZE) {
        my $rest = RECORD_SIZE - $read;
        while (my $read = $handle->sysread($chunk, $rest, length($chunk))) {
            # print STDERR " plus [$read]\n";
            $rest -= $read;
        }
    }
    if ($chunk eq "\0" x 512) {
        # look for the second record full of zeroes
        my $pad;
        my $read = $handle->sysread($pad, RECORD_SIZE);
        if ($read) {
            if ($read < RECORD_SIZE) {
                my $rest = RECORD_SIZE - $read;
                while (my $read = $handle->sysread($pad, $rest, length($pad))) {
                    $rest -= $read;
                }
            }
        }
        if ($pad ne "\0" x 512) {
            die "Failed to find second stop record.\n";
        }
        print $chunk;
        print $pad;
        $total_len += length($chunk) + length($pad);
        print "\0" x (padded_record_size($total_len, GIT_BLOCK_SIZE) - $total_len);
        exit;
    }

    my ($name, $data1, $size, $mtime, $checksum, $link, $name2, $data2) = unpack 'A100 A24 A12 A12 A8 A1 A100 a*', $chunk;
    my $block_size = $size ? padded_record_size( oct $size ) : $size;
    # print STDERR "[$name] [$size] [$mtime] [$checksum] [$link] [$name2] [$block_size]\n";

    if ($need_header and $link ne 'g' and defined $comment) {
        my $header = pack 'a100 a8 a8 a8 a12 a12 A8 a1 a100 a6 a2 a32 a32 a8 a8 a155 x12',
            'pax_global_header', (sprintf "%07o", 0666), '0000000', '0000000',
            '00000000064', $stamp_octal, '', 'g', '',
            'ustar', '00', 'root', 'root', '0000000', '0000000', '';
        substr($header, 148, 8) = sprintf("%07o\0", unpack("%16C*", $header));
        print $header;
        print pack "a512", "52 comment=$comment\n";
        $need_header = 0;
        $total_len += 2 * 512;
    }

    my $out = $chunk;
    my $write_comment = 0;
    if ($mtime) {
        substr($out, 136, 12) = pack "a12", $stamp_octal;
        substr($out, 148, 8) = pack "A8", "";
        substr($out, 148, 8) = sprintf("%07o\0", unpack("%16C*", $out));
        if ($link eq 'g' and oct $size == 52) {
            $write_comment = 1;
        }
    }
    print $out;
    $total_len += length $out;

    my $payload;
    while (my $read = $handle->sysread( $payload, $block_size )) {
        if (defined $comment and $write_comment) {
            if ($read < 52) {
                die "Would like to put SHA1 into header but did not read at least 52 bytes.\n";
            }
            if (not $payload =~ /^52 comment=/) {
                die "The header payload is not [52 comment=].\n";
            }
            substr($payload, 0, 52) = "52 comment=$comment\n";
        }
        # print STDERR " payload [@{[ length $payload ]}]\n";
        print $payload;
        $total_len += length $payload;
        $block_size -= $read;
        last unless $block_size;
    }
}

sub padded_record_size {
    my $len = shift;
    my $pad_size = shift || RECORD_SIZE;
    my $out = int($len / $pad_size);
    $out++ if $len % $pad_size;
    return $out * $pad_size;
}

setup.py

@@ -37,7 +37,6 @@ setup(
    # non-python scripts go here
    scripts=[
        'bin/tito',
        'bin/tar-fixup-stamp-comment.pl',
        'bin/test-setup-specfile.pl',
        'bin/generate-patches.pl'
    ],

@@ -24,6 +24,7 @@ from bugzilla.rhbugzilla import RHBugzilla
from tito.compat import xmlrpclib, getstatusoutput
from tito.exception import TitoException
from tito.exception import RunCommandException
from tito.tar import TarFixer, RECORD_SIZE
DEFAULT_BUILD_DIR = "/tmp/tito"
DEFAULT_BUILDER = "builder"
@@ -645,16 +646,18 @@ def create_tgz(git_root, prefix, commit, relative_dir,
    os.chdir(os.path.abspath(git_root))
    timestamp = get_commit_timestamp(commit)
    timestamp_script = get_script_path("tar-fixup-stamp-comment.pl")

    # Accommodate standalone projects with specfile in root of git repo:
    relative_git_dir = "%s" % relative_dir
    if relative_git_dir in ['/', './']:
        relative_git_dir = ""

    basename = os.path.splitext(dest_tgz)[0]
    initial_tar = "%s.initial" % basename

    # command to generate a git-archive
    git_archive_cmd = 'git archive --format=tar --prefix=%s/ %s:%s' % (
        prefix, commit, relative_git_dir)
    git_archive_cmd = 'git archive --format=tar --prefix=%s/ %s:%s --output=%s' % (
        prefix, commit, relative_git_dir, initial_tar)
    run_command(git_archive_cmd)

    # Run git-archive separately if --debug was specified.
    # This allows us to detect failure early.
@@ -662,12 +665,16 @@ def create_tgz(git_root, prefix, commit, relative_dir,
    debug('git-archive fails if relative dir is not in git tree',
          '%s > /dev/null' % git_archive_cmd)
    # If we're still alive, the previous command worked
    archive_cmd = ('%s | %s %s %s | gzip -n -c - > %s' % (
        git_archive_cmd, timestamp_script,
        timestamp, commit, dest_tgz))
    debug(archive_cmd)
    return run_command(archive_cmd)

    fixed_tar = "%s.tar" % basename
    fixed_tar_fh = open(fixed_tar, 'wb')
    try:
        tarfixer = TarFixer(open(initial_tar, 'rb', RECORD_SIZE), fixed_tar_fh, timestamp, commit)
        tarfixer.fix()
    finally:
        fixed_tar_fh.close()

    # It's a pity we can't use Python's gzip, but it doesn't offer an equivalent of -n
    return run_command("gzip -n -c < %s > %s" % (fixed_tar, dest_tgz))
def get_git_repo_url():
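
A side note on the gzip step above: the commit keeps shelling out to the gzip binary because of the -n behaviour called out in the comment. For readers wondering what a pure-Python alternative could look like, here is a rough sketch that assumes the mtime parameter gzip.GzipFile gained in Python 2.7; it is an illustration of the trade-off, not code from this commit, and its output may still differ byte-for-byte from the gzip CLI because of differing default compression levels:

import gzip

def gzip_without_metadata(src_path, dest_path):
    # Approximates "gzip -n -c": store no original filename and force the
    # gzip header mtime to 0, so identical input yields identical output.
    src = open(src_path, 'rb')
    dest = open(dest_path, 'wb')
    try:
        gz = gzip.GzipFile(filename='', mode='wb', fileobj=dest, mtime=0)
        try:
            gz.write(src.read())
        finally:
            gz.close()
    finally:
        src.close()
        dest.close()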

src/tito/tar.py (new file, 296 lines)

@@ -0,0 +1,296 @@
# Copyright (c) 2008-2009 Red Hat, Inc.
#
# This software is licensed to you under the GNU General Public License,
# version 2 (GPLv2). There is NO WARRANTY for this software, express or
# implied, including the implied warranties of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
# along with this software; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
#
# Red Hat trademarks are not licensed under GPLv2. No permission is
# granted to use or replicate Red Hat trademarks that are incorporated
# in this software or its documentation.
import re
import struct
import sys

RECORD_SIZE = 512

# Git writes its tarballs to be a multiple of 10240. I'm not sure why: the
# implementation in archive-tar.c doesn't have any comments on the matter.
GIT_BLOCK_SIZE = RECORD_SIZE * 20


class TarFixer(object):
    """Code for updating a tar header's mtime. For details on the tar format
    see http://www.gnu.org/software/tar/manual/html_node/Standard.html and
    http://en.wikipedia.org/wiki/Tar_%28computing%29

    Tito passes "git archive" a tree ID. The "git archive" man page states:

        git archive behaves differently when given a tree ID versus when given
        a commit ID or tag ID. In the first case the current time is used as
        the modification time of each file in the archive.

    Using the current time means that every time we build the source tarball,
    the file fingerprint will change since the metadata in the tarball changes.
    We don't want that since build systems track the fingerprint to see if
    the actual source has changed.

    The resultant tarball will be in this format:

        - Global header (512 bytes)
        - Extended header block with git ref (512 bytes)
        - [File header (512 bytes) + File data padded to multiple of 512] * number of files
        - 1024 NUL bytes
        - However many NUL bytes are necessary to pad the file to a multiple of GIT_BLOCK_SIZE

    The block after the global header with the git ref is called an "extended header".
    We are technically writing a "pax" archive because of the use of extensions. According
    to the comments in git's archive-tar.c

        pax extended header records have the format "%u %s=%s\n". %u contains
        the size of the whole string (including the %u), the first %s is the
        keyword, the second one is the value.
    """

    def __init__(self, fh, out, timestamp, gitref):
        # As defined in tar.h
        self.tar_struct = [
            ('name', '100s'),
            ('mode', '8s'),
            ('uid', '8s'),
            ('gid', '8s'),
            ('size', '12s'),
            ('mtime', '12s'),
            ('checksum', '8s'),
            ('typeflag', '1s'),
            ('linkname', '100s'),
            ('magic', '6s'),
            ('version', '2s'),
            ('uname', '32s'),
            ('gname', '32s'),
            ('devmajor', '8s'),
            ('devminor', '8s'),
            ('prefix', '155s'),
        ]

        # The items in the list below are zero-padded octal numbers in ASCII.
        # All other fields are null-terminated character strings. Each numeric
        # field of width w contains w minus 1 digits, and a null.
        #
        # The checksum is technically an octal_member but we handle it specially.
        self.octal_members = [
            'mode',
            'uid',
            'gid',
            'size',
            'mtime',
            'devmajor',
            'devminor',
        ]

        # Add an '=' to use native byte order with standard sizes
        self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct))
        self.struct_members = map(lambda x: x[0], self.tar_struct)
        self.struct_hash = dict(self.tar_struct)

        # The tarballs created by git archive from tree IDs don't have a global
        # header for some reason.
        self.need_header = True
        self.done = False

        # We need to track the total number of bytes we've written so we can
        # pad out the final tarball to be a multiple of GIT_BLOCK_SIZE
        self.total_length = 0

        self.fh = fh
        self.out = out
        self.timestamp = int(timestamp)
        self.gitref = gitref

    def chunk_to_hash(self, chunk):
        # Our struct template is only 500 bytes, but the last 12 bytes are NUL
        # I elected to ignore them completely instead of including them in the
        # template as '12x'. The unpack_from method will read the bytes our
        # template defines from chunk and discard the rest.
        unpacked = struct.unpack_from(self.struct_template, chunk)

        # Zip what we read together with the member names and create a dictionary
        chunk_props = dict(zip(self.struct_members, unpacked))

        return chunk_props

    def padded_size(self, length, pad_size=RECORD_SIZE):
        """Function to pad out a length to the nearest multiple of pad_size
        that can contain it."""
        blocks = length / pad_size
        if length % pad_size != 0:
            blocks += 1
        return blocks * pad_size

    def create_global_header(self):
        header_props = {
            'name': 'pax_global_header',
            'mode': 0o666,
            'uid': 0,
            'gid': 0,
            'size': 52,  # The size of the extended header with the gitref
            'mtime': self.timestamp,
            'typeflag': 'g',
            'linkname': '',
            'magic': 'ustar',
            'version': '00',
            'uname': 'root',
            'gname': 'root',
            'devmajor': 0,
            'devminor': 0,
            'prefix': '',
        }
        values = self.encode_header(header_props, header_props.keys())
        header_props['checksum'] = self.calculate_checksum(values)
        self.process_header(header_props)

    def encode_header(self, chunk_props, members=None):
        if members is None:
            members = self.struct_members

        pack_values = []
        for member in members:
            if member in self.octal_members:
                # Pad out the octal value to the right length
                member_template = self.struct_hash[member]
                size = int(re.match('(\d+)', member_template).group(1)) - 1
                size = str(size)
                fmt = "%0" + size + "o\x00"
                pack_values.append(fmt % chunk_props[member])
            else:
                pack_values.append(chunk_props[member])
        return pack_values

    def process_header(self, chunk_props):
        """There is a header before every file and a global header at the top."""
        pack_values = self.encode_header(chunk_props)
        # The struct itself is only 500 bytes so we have to pad it to 512
        data_out = struct.pack(self.struct_template + "12x", *pack_values)
        self.out.write(data_out)
        self.total_length += len(data_out)

    def process_extended_header(self):
        # Trash the original comment
        _ = self.fh.read(RECORD_SIZE)
        self.create_extended_header()

    def create_extended_header(self):
        # pax extended header records have the format "%u %s=%s\n". %u contains
        # the size of the whole string (including the %u), the first %s is the
        # keyword, the second one is the value.
        #
        # Since the git ref is always 40 characters we can
        # pre-compute the length to put in the extended header
        comment = "52 comment=%s\n" % self.gitref
        data_out = struct.pack("=512s", comment)
        self.out.write(data_out)
        self.total_length += len(data_out)

    def process_file_data(self, size):
        data_out = self.fh.read(self.padded_size(size))
        self.out.write(data_out)
        self.total_length += len(data_out)

    def calculate_checksum(self, values):
        """The checksum field is the ASCII representation of the octal value of the simple
        sum of all bytes in the header block. Each 8-bit byte in the header is added
        to an unsigned integer, initialized to zero, the precision of which shall be
        no less than seventeen bits. When calculating the checksum, the checksum field is
        treated as if it were all spaces.

        Callers of this method are responsible for *not* sending in the previous checksum.
        """
        new_chksum = 0
        for val in values:
            for x in val:
                new_chksum += ord(x)
        for blank in " " * 8:
            new_chksum += ord(blank)
        return "%07o\x00" % new_chksum

    def process_chunk(self, chunk):
        # Tar archives end with two 512 byte blocks of zeroes
        if chunk == "\x00" * 512:
            self.out.write(chunk)
            self.total_length += len(chunk)
            if self.last_chunk_was_nulls:
                self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length))
                self.done = True
            self.last_chunk_was_nulls = True
            return

        self.last_chunk_was_nulls = False

        chunk_props = self.chunk_to_hash(chunk)

        # This line is the whole purpose of this class!
        chunk_props['mtime'] = "%011o\x00" % self.timestamp

        # Delete the old checksum since it's now invalid and we don't want to pass
        # it in to calculate_checksum().
        del(chunk_props['checksum'])
        chunk_props['checksum'] = self.calculate_checksum(chunk_props.values())

        # Remove the trailing NUL byte(s) on the end of members
        for k, v in chunk_props.items():
            chunk_props[k] = v.rstrip("\x00")

        for member in self.octal_members:
            # Convert octals to decimal
            chunk_props[member] = int(chunk_props[member], 8)

        # If there is no global header, we need to create one
        if self.need_header:
            # When run against a tree ID, git archive doesn't create
            # a global header. The first block is just the header for
            # the first file.
            if chunk_props['typeflag'] != 'g':
                self.create_global_header()
                self.create_extended_header()
                self.process_header(chunk_props)
            else:
                self.process_header(chunk_props)
                self.process_extended_header()
            self.need_header = False
        else:
            self.process_header(chunk_props)

        self.process_file_data(chunk_props['size'])

    def fix(self):
        try:
            chunk = self.fh.read(RECORD_SIZE)
            while chunk != "" and not self.done:
                self.process_chunk(chunk)
                chunk = self.fh.read(RECORD_SIZE)
        finally:
            self.fh.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])

    try:
        timestamp = int(sys.argv[1])
    except:
        sys.exit("UNIX_TIMESTAMP must be an integer")

    gitref = sys.argv[2]
    tar_file = sys.argv[3]

    try:
        fh = open(tar_file, 'rb', RECORD_SIZE)
    except:
        print("Could not read %s" % tar_file)

    reader = TarFixer(fh, sys.stdout, timestamp, gitref)
    reader.fix()
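
One detail worth spelling out: the hard-coded 52 above is not arbitrary. An extended header record is "%u %s=%s\n", the git ref is always 40 hex characters, and "52" + space + "comment=" + ref + newline comes to 2 + 1 + 8 + 40 + 1 = 52 bytes, which is why both the old Perl script and TarFixer can pre-compute it. A quick, purely illustrative check:

gitref = "3518d720bff20db887b7a5e5dddd411d14dca1f9"  # any 40-character ref
record = "52 comment=%s\n" % gitref
assert len(record) == 52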

Binary files not shown: the two test fixtures referenced by the tests below, test/unit/resources/archive.tar and test/unit/resources/archive-fixed.tar.

test/unit/test_tar.py (new file, 72 lines)

@@ -0,0 +1,72 @@
import hashlib
import os
import unittest

from StringIO import StringIO

from tito.tar import TarFixer

EXPECTED_TIMESTAMP = 1429725106
EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"


class TarTest(unittest.TestCase):
    def setUp(self):
        self.out = StringIO()
        self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
        self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
        self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
        self.reference_hash = self.hash_file(self.reference_file)

    def tearDown(self):
        self.out = None

    def hash_file(self, filename):
        return self.hash_buffer(open(filename, 'rb').read())

    def hash_buffer(self, buf):
        hasher = hashlib.sha256()
        hasher.update(buf)
        return hasher.hexdigest()

    def test_fix(self):
        self.fh = open(self.test_file)
        self.tarfixer.fh = self.fh
        self.tarfixer.fix()
        self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist)))

    def test_padded_size_length_small(self):
        length = 10
        block_size = 512
        self.assertEqual(512, self.tarfixer.padded_size(length, block_size))

    def test_padded_size_length_spot_on(self):
        length = 512
        block_size = 512
        self.assertEqual(512, self.tarfixer.padded_size(length, block_size))

    def test_padded_size_length_over(self):
        length = 513
        block_size = 512
        self.assertEqual(1024, self.tarfixer.padded_size(length, block_size))

    def test_create_extended_header(self):
        self.tarfixer.create_extended_header()
        header = "".join(self.out.buflist)
        self.assertEqual(512, len(header))
        self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
        self.assertEqual("\x00" * (512 - 53), header[53:])

    def test_calculate_checksume(self):
        result = self.tarfixer.calculate_checksum(['\x01', '\x02', '\x03', '\x04'])
        expected_result = 10 + ord(" ") * 8
        self.assertEqual("%07o\x00" % expected_result, result)

    def test_encode_header(self):
        mode = 123
        chunk = {
            'mode': mode,
            'name': 'hello',
        }
        result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
        expected_result = ["%07o\x00" % mode, 'hello']
        self.assertEqual(result, expected_result)
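
As a usage note, the reference fixture the test compares against could in principle be regenerated by running TarFixer over the raw archive with the same constants. The snippet below is a hypothetical sketch of that; the resource paths and the idea of regenerating the fixture are assumptions, not part of this commit:

from tito.tar import TarFixer

EXPECTED_TIMESTAMP = 1429725106
EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"

src = open('test/unit/resources/archive.tar', 'rb')
dest = open('test/unit/resources/archive-fixed.tar', 'wb')
try:
    # TarFixer.fix() closes its input handle itself
    TarFixer(src, dest, EXPECTED_TIMESTAMP, EXPECTED_REF).fix()
finally:
    dest.close()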

tito.spec

@@ -112,7 +112,6 @@ rm -rf $RPM_BUILD_ROOT
%doc %{_mandir}/man5/releasers.conf.5*
%doc %{_mandir}/man8/tito.8*
%{_bindir}/tito
%{_bindir}/tar-fixup-stamp-comment.pl
%{_bindir}/test-setup-specfile.pl
%{_bindir}/generate-patches.pl
%{_datadir}/bash-completion/completions/tito