Fix tarball timestamps from git archive with Python.

Tito passes "git archive" a tree ID.  The "git archive" man page states:

    git archive behaves differently when given a tree ID versus when
    given a commit ID or tag ID. In the first case the current time
    is used as the modification time of each file in the archive.

Using the current time means that every time we build the source
tarball, the file fingerprint will change since the metadata in the
tarball changes.  We don't want that since build systems track the
fingerprint to see if the actual source has changed.

This process was previously handled in an enigmatic Perl script that
lacked any comments whatsoever.  Converting it to well-commented Python
makes the process less mysterious and speedier since Tito doesn't need
to shell out to Perl.
Alex Wood 2015-04-23 18:20:28 -04:00
parent 35baac73e0
commit 09d89eb4ef
8 changed files with 385 additions and 139 deletions
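
For orientation before the diff: the core of the fix, in both the retired Perl script and the new TarFixer class added below, is to overwrite the 12-byte mtime field at offset 136 of every 512-byte ustar header and then recompute the 8-byte checksum field at offset 148, treating the checksum bytes as spaces while summing. A minimal standalone sketch of that operation follows; the helper name and framing are illustrative, not code from this commit:

RECORD_SIZE = 512

def patch_header_mtime(header, timestamp):
    """Return a copy of one 512-byte ustar header with its mtime replaced
    and its checksum recomputed (illustrative helper, not part of tito)."""
    assert len(header) == RECORD_SIZE
    # mtime is a 12-byte, zero-padded octal field at offset 136
    header = header[:136] + ("%011o\x00" % timestamp) + header[148:]
    # The checksum is the simple sum of all header bytes with the 8-byte
    # checksum field (offsets 148-155) treated as spaces.
    summable = header[:148] + " " * 8 + header[156:]
    checksum = sum(ord(c) for c in summable)
    # Store it the same way the code below does: 7 octal digits plus a NUL
    return header[:148] + ("%07o\x00" % checksum) + header[156:]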

bin/tar-fixup-stamp-comment.pl (deleted file, 127 lines)

@@ -1,127 +0,0 @@
#!/usr/bin/perl
#
# Copyright (c) 2008-2009 Red Hat, Inc.
#
# This software is licensed to you under the GNU General Public License,
# version 2 (GPLv2). There is NO WARRANTY for this software, express or
# implied, including the implied warranties of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
# along with this software; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
#
# Red Hat trademarks are not licensed under GPLv2. No permission is
# granted to use or replicate Red Hat trademarks that are incorporated
# in this software or its documentation.
use strict;
use warnings FATAL => 'all';

use IO::Handle ();

use constant RECORD_SIZE => 512;
use constant GIT_BLOCK_SIZE => RECORD_SIZE * 20;

my $stamp = shift;
if (not defined $stamp) {
    die "Please specify stamp to put into the tar as the first parameter.\n";
}
my $stamp_octal = sprintf "%011o", $stamp;

my $comment = shift;
if (defined $comment) {
    if (not $comment =~ /^[0-9a-f]{40}$/) {
        die "The comment we will put into the tar should be SHA1 in hex (40 characters).\n";
    }
}

my $chunk;
my $handle = \*STDIN;
my $read;
my $need_header = 1;
my $total_len = 0;
while ($read = $handle->sysread($chunk, RECORD_SIZE)) {
    # print STDERR "read [$read]\n";
    if ($read < RECORD_SIZE) {
        my $rest = RECORD_SIZE - $read;
        while (my $read = $handle->sysread($chunk, $rest, length($chunk))) {
            # print STDERR " plus [$read]\n";
            $rest -= $read;
        }
    }
    if ($chunk eq "\0" x 512) {
        # look for the second record full of zeroes
        my $pad;
        my $read = $handle->sysread($pad, RECORD_SIZE);
        if ($read) {
            if ($read < RECORD_SIZE) {
                my $rest = RECORD_SIZE - $read;
                while (my $read = $handle->sysread($pad, $rest, length($pad))) {
                    $rest -= $read;
                }
            }
        }
        if ($pad ne "\0" x 512) {
            die "Failed to find second stop record.\n";
        }
        print $chunk;
        print $pad;
        $total_len += length($chunk) + length($pad);
        print "\0" x (padded_record_size($total_len, GIT_BLOCK_SIZE) - $total_len);
        exit;
    }

    my ($name, $data1, $size, $mtime, $checksum, $link, $name2, $data2) = unpack 'A100 A24 A12 A12 A8 A1 A100 a*', $chunk;
    my $block_size = $size ? padded_record_size( oct $size ) : $size;
    # print STDERR "[$name] [$size] [$mtime] [$checksum] [$link] [$name2] [$block_size]\n";

    if ($need_header and $link ne 'g' and defined $comment) {
        my $header = pack 'a100 a8 a8 a8 a12 a12 A8 a1 a100 a6 a2 a32 a32 a8 a8 a155 x12',
            'pax_global_header', (sprintf "%07o", 0666), '0000000', '0000000',
            '00000000064', $stamp_octal, '', 'g', '',
            'ustar', '00', 'root', 'root', '0000000', '0000000', '';
        substr($header, 148, 8) = sprintf("%07o\0", unpack("%16C*", $header));
        print $header;
        print pack "a512", "52 comment=$comment\n";
        $need_header = 0;
        $total_len += 2 * 512;
    }

    my $out = $chunk;
    my $write_comment = 0;
    if ($mtime) {
        substr($out, 136, 12) = pack "a12", $stamp_octal;
        substr($out, 148, 8) = pack "A8", "";
        substr($out, 148, 8) = sprintf("%07o\0", unpack("%16C*", $out));
        if ($link eq 'g' and oct $size == 52) {
            $write_comment = 1;
        }
    }
    print $out;
    $total_len += length $out;

    my $payload;
    while (my $read = $handle->sysread( $payload, $block_size )) {
        if (defined $comment and $write_comment) {
            if ($read < 52) {
                die "Would like to put SHA1 into header but did not read at least 52 bytes.\n";
            }
            if (not $payload =~ /^52 comment=/) {
                die "The header payload is not [52 comment=].\n";
            }
            substr($payload, 0, 52) = "52 comment=$comment\n";
        }
        # print STDERR " payload [@{[ length $payload ]}]\n";
        print $payload;
        $total_len += length $payload;
        $block_size -= $read;
        last unless $block_size;
    }
}

sub padded_record_size {
    my $len = shift;
    my $pad_size = shift || RECORD_SIZE;
    my $out = int($len / $pad_size);
    $out++ if $len % $pad_size;
    return $out * $pad_size;
}

setup.py

@@ -37,7 +37,6 @@ setup(
    # non-python scripts go here
    scripts=[
        'bin/tito',
        'bin/tar-fixup-stamp-comment.pl',
        'bin/test-setup-specfile.pl',
        'bin/generate-patches.pl'
    ],

@@ -24,6 +24,7 @@ from bugzilla.rhbugzilla import RHBugzilla
from tito.compat import xmlrpclib, getstatusoutput
from tito.exception import TitoException
from tito.exception import RunCommandException
from tito.tar import TarFixer, RECORD_SIZE
DEFAULT_BUILD_DIR = "/tmp/tito"
DEFAULT_BUILDER = "builder"
@@ -645,16 +646,18 @@ def create_tgz(git_root, prefix, commit, relative_dir,
    os.chdir(os.path.abspath(git_root))
    timestamp = get_commit_timestamp(commit)
    timestamp_script = get_script_path("tar-fixup-stamp-comment.pl")

    # Accommodate standalone projects with specfile in root of git repo:
    relative_git_dir = "%s" % relative_dir
    if relative_git_dir in ['/', './']:
        relative_git_dir = ""

    basename = os.path.splitext(dest_tgz)[0]
    initial_tar = "%s.initial" % basename

    # command to generate a git-archive
    git_archive_cmd = 'git archive --format=tar --prefix=%s/ %s:%s' % (
        prefix, commit, relative_git_dir)
    git_archive_cmd = 'git archive --format=tar --prefix=%s/ %s:%s --output=%s' % (
        prefix, commit, relative_git_dir, initial_tar)
    run_command(git_archive_cmd)

    # Run git-archive separately if --debug was specified.
    # This allows us to detect failure early.
@@ -662,12 +665,16 @@ def create_tgz(git_root, prefix, commit, relative_dir,
    debug('git-archive fails if relative dir is not in git tree',
          '%s > /dev/null' % git_archive_cmd)
    # If we're still alive, the previous command worked
    archive_cmd = ('%s | %s %s %s | gzip -n -c - > %s' % (
        git_archive_cmd, timestamp_script,
        timestamp, commit, dest_tgz))
    debug(archive_cmd)
    return run_command(archive_cmd)

    fixed_tar = "%s.tar" % basename
    fixed_tar_fh = open(fixed_tar, 'wb')
    try:
        tarfixer = TarFixer(open(initial_tar, 'rb', RECORD_SIZE), fixed_tar_fh, timestamp, commit)
        tarfixer.fix()
    finally:
        fixed_tar_fh.close()

    # It's a pity we can't use Python's gzip, but it doesn't offer an equivalent of -n
    return run_command("gzip -n -c < %s > %s" % (fixed_tar, dest_tgz))
def get_git_repo_url():
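
A side note on the gzip step above: the commit keeps shelling out to the gzip binary because of the -n behaviour called out in the comment. For readers wondering what a pure-Python alternative could look like, here is a rough sketch that assumes the mtime parameter gzip.GzipFile gained in Python 2.7; it is an illustration of the trade-off, not code from this commit, and its output may still differ byte-for-byte from the gzip CLI because of differing default compression levels:

import gzip

def gzip_without_metadata(src_path, dest_path):
    # Approximates "gzip -n -c": store no original filename and force the
    # gzip header mtime to 0, so identical input yields identical output.
    src = open(src_path, 'rb')
    dest = open(dest_path, 'wb')
    try:
        gz = gzip.GzipFile(filename='', mode='wb', fileobj=dest, mtime=0)
        try:
            gz.write(src.read())
        finally:
            gz.close()
    finally:
        src.close()
        dest.close()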

src/tito/tar.py (new file, 296 lines)

@@ -0,0 +1,296 @@
# Copyright (c) 2008-2009 Red Hat, Inc.
#
# This software is licensed to you under the GNU General Public License,
# version 2 (GPLv2). There is NO WARRANTY for this software, express or
# implied, including the implied warranties of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
# along with this software; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
#
# Red Hat trademarks are not licensed under GPLv2. No permission is
# granted to use or replicate Red Hat trademarks that are incorporated
# in this software or its documentation.
import re
import struct
import sys

RECORD_SIZE = 512

# Git writes its tarballs to be a multiple of 10240. I'm not sure why: the
# implementation in archive-tar.c doesn't have any comments on the matter.
GIT_BLOCK_SIZE = RECORD_SIZE * 20


class TarFixer(object):
    """Code for updating a tar header's mtime. For details on the tar format
    see http://www.gnu.org/software/tar/manual/html_node/Standard.html and
    http://en.wikipedia.org/wiki/Tar_%28computing%29

    Tito passes "git archive" a tree ID. The "git archive" man page states:

        git archive behaves differently when given a tree ID versus when given
        a commit ID or tag ID. In the first case the current time is used as
        the modification time of each file in the archive.

    Using the current time means that every time we build the source tarball,
    the file fingerprint will change since the metadata in the tarball changes.
    We don't want that since build systems track the fingerprint to see if
    the actual source has changed.

    The resultant tarball will be in this format:

        - Global header (512 bytes)
        - Extended header block with git ref (512 bytes)
        - [File header (512 bytes) + File data padded to multiple of 512] * number of files
        - 1024 NUL bytes
        - However many NUL bytes are necessary to pad the file to a multiple of GIT_BLOCK_SIZE

    The block after the global header with the git ref is called an "extended header".
    We are technically writing a "pax" archive because of the use of extensions. According
    to the comments in git's archive-tar.c

        pax extended header records have the format "%u %s=%s\n". %u contains
        the size of the whole string (including the %u), the first %s is the
        keyword, the second one is the value.
    """

    def __init__(self, fh, out, timestamp, gitref):
        # As defined in tar.h
        self.tar_struct = [
            ('name', '100s'),
            ('mode', '8s'),
            ('uid', '8s'),
            ('gid', '8s'),
            ('size', '12s'),
            ('mtime', '12s'),
            ('checksum', '8s'),
            ('typeflag', '1s'),
            ('linkname', '100s'),
            ('magic', '6s'),
            ('version', '2s'),
            ('uname', '32s'),
            ('gname', '32s'),
            ('devmajor', '8s'),
            ('devminor', '8s'),
            ('prefix', '155s'),
        ]

        # The items in the list below are zero-padded octal numbers in ASCII.
        # All other fields are null-terminated character strings. Each numeric
        # field of width w contains w minus 1 digits, and a null.
        #
        # The checksum is technically an octal_member but we handle it specially.
        self.octal_members = [
            'mode',
            'uid',
            'gid',
            'size',
            'mtime',
            'devmajor',
            'devminor',
        ]

        # Add an '=' to use native byte order with standard sizes
        self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct))
        self.struct_members = map(lambda x: x[0], self.tar_struct)
        self.struct_hash = dict(self.tar_struct)

        # The tarballs created by git archive from tree IDs don't have a global
        # header for some reason.
        self.need_header = True
        self.done = False

        # We need to track the total number of bytes we've written so we can
        # pad out the final tarball to be a multiple of GIT_BLOCK_SIZE
        self.total_length = 0

        self.fh = fh
        self.out = out
        self.timestamp = int(timestamp)
        self.gitref = gitref

    def chunk_to_hash(self, chunk):
        # Our struct template is only 500 bytes, but the last 12 bytes are NUL
        # I elected to ignore them completely instead of including them in the
        # template as '12x'. The unpack_from method will read the bytes our
        # template defines from chunk and discard the rest.
        unpacked = struct.unpack_from(self.struct_template, chunk)

        # Zip what we read together with the member names and create a dictionary
        chunk_props = dict(zip(self.struct_members, unpacked))

        return chunk_props

    def padded_size(self, length, pad_size=RECORD_SIZE):
        """Function to pad out a length to the nearest multiple of pad_size
        that can contain it."""
        blocks = length / pad_size
        if length % pad_size != 0:
            blocks += 1
        return blocks * pad_size

    def create_global_header(self):
        header_props = {
            'name': 'pax_global_header',
            'mode': 0o666,
            'uid': 0,
            'gid': 0,
            'size': 52,  # The size of the extended header with the gitref
            'mtime': self.timestamp,
            'typeflag': 'g',
            'linkname': '',
            'magic': 'ustar',
            'version': '00',
            'uname': 'root',
            'gname': 'root',
            'devmajor': 0,
            'devminor': 0,
            'prefix': '',
        }
        values = self.encode_header(header_props, header_props.keys())
        header_props['checksum'] = self.calculate_checksum(values)
        self.process_header(header_props)

    def encode_header(self, chunk_props, members=None):
        if members is None:
            members = self.struct_members

        pack_values = []
        for member in members:
            if member in self.octal_members:
                # Pad out the octal value to the right length
                member_template = self.struct_hash[member]
                size = int(re.match('(\d+)', member_template).group(1)) - 1
                size = str(size)
                fmt = "%0" + size + "o\x00"
                pack_values.append(fmt % chunk_props[member])
            else:
                pack_values.append(chunk_props[member])
        return pack_values

    def process_header(self, chunk_props):
        """There is a header before every file and a global header at the top."""
        pack_values = self.encode_header(chunk_props)
        # The struct itself is only 500 bytes so we have to pad it to 512
        data_out = struct.pack(self.struct_template + "12x", *pack_values)
        self.out.write(data_out)
        self.total_length += len(data_out)

    def process_extended_header(self):
        # Trash the original comment
        _ = self.fh.read(RECORD_SIZE)
        self.create_extended_header()

    def create_extended_header(self):
        # pax extended header records have the format "%u %s=%s\n". %u contains
        # the size of the whole string (including the %u), the first %s is the
        # keyword, the second one is the value.
        #
        # Since the git ref is always 40 characters we can
        # pre-compute the length to put in the extended header
        comment = "52 comment=%s\n" % self.gitref
        data_out = struct.pack("=512s", comment)
        self.out.write(data_out)
        self.total_length += len(data_out)

    def process_file_data(self, size):
        data_out = self.fh.read(self.padded_size(size))
        self.out.write(data_out)
        self.total_length += len(data_out)

    def calculate_checksum(self, values):
        """The checksum field is the ASCII representation of the octal value of the simple
        sum of all bytes in the header block. Each 8-bit byte in the header is added
        to an unsigned integer, initialized to zero, the precision of which shall be
        no less than seventeen bits. When calculating the checksum, the checksum field is
        treated as if it were all spaces.

        Callers of this method are responsible for *not* sending in the previous checksum.
        """
        new_chksum = 0
        for val in values:
            for x in val:
                new_chksum += ord(x)
        for blank in " " * 8:
            new_chksum += ord(blank)
        return "%07o\x00" % new_chksum

    def process_chunk(self, chunk):
        # Tar archives end with two 512 byte blocks of zeroes
        if chunk == "\x00" * 512:
            self.out.write(chunk)
            self.total_length += len(chunk)
            if self.last_chunk_was_nulls:
                self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length))
                self.done = True
            self.last_chunk_was_nulls = True
            return

        self.last_chunk_was_nulls = False

        chunk_props = self.chunk_to_hash(chunk)

        # This line is the whole purpose of this class!
        chunk_props['mtime'] = "%011o\x00" % self.timestamp

        # Delete the old checksum since it's now invalid and we don't want to pass
        # it in to calculate_checksum().
        del(chunk_props['checksum'])
        chunk_props['checksum'] = self.calculate_checksum(chunk_props.values())

        # Remove the trailing NUL byte(s) on the end of members
        for k, v in chunk_props.items():
            chunk_props[k] = v.rstrip("\x00")

        for member in self.octal_members:
            # Convert octals to decimal
            chunk_props[member] = int(chunk_props[member], 8)

        # If there is no global header, we need to create one
        if self.need_header:
            # When run against a tree ID, git archive doesn't create
            # a global header. The first block is just the header for
            # the first file.
            if chunk_props['typeflag'] != 'g':
                self.create_global_header()
                self.create_extended_header()
                self.process_header(chunk_props)
            else:
                self.process_header(chunk_props)
                self.process_extended_header()
            self.need_header = False
        else:
            self.process_header(chunk_props)

        self.process_file_data(chunk_props['size'])

    def fix(self):
        try:
            chunk = self.fh.read(RECORD_SIZE)
            while chunk != "" and not self.done:
                self.process_chunk(chunk)
                chunk = self.fh.read(RECORD_SIZE)
        finally:
            self.fh.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])

    try:
        timestamp = int(sys.argv[1])
    except:
        sys.exit("UNIX_TIMESTAMP must be an integer")

    gitref = sys.argv[2]
    tar_file = sys.argv[3]

    try:
        fh = open(tar_file, 'rb', RECORD_SIZE)
    except:
        print("Could not read %s" % tar_file)

    reader = TarFixer(fh, sys.stdout, timestamp, gitref)
    reader.fix()
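
One detail worth spelling out: the hard-coded 52 above is not arbitrary. An extended header record is "%u %s=%s\n", the git ref is always 40 hex characters, and "52" + space + "comment=" + ref + newline comes to 2 + 1 + 8 + 40 + 1 = 52 bytes, which is why both the old Perl script and TarFixer can pre-compute it. A quick, purely illustrative check:

gitref = "3518d720bff20db887b7a5e5dddd411d14dca1f9"  # any 40-character ref
record = "52 comment=%s\n" % gitref
assert len(record) == 52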

Binary files not shown: the two test fixtures referenced by the tests below, test/unit/resources/archive.tar and test/unit/resources/archive-fixed.tar.

test/unit/test_tar.py (new file, 72 lines)

@@ -0,0 +1,72 @@
import hashlib
import os
import unittest

from StringIO import StringIO

from tito.tar import TarFixer

EXPECTED_TIMESTAMP = 1429725106
EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"


class TarTest(unittest.TestCase):
    def setUp(self):
        self.out = StringIO()
        self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
        self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
        self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
        self.reference_hash = self.hash_file(self.reference_file)

    def tearDown(self):
        self.out = None

    def hash_file(self, filename):
        return self.hash_buffer(open(filename, 'rb').read())

    def hash_buffer(self, buf):
        hasher = hashlib.sha256()
        hasher.update(buf)
        return hasher.hexdigest()

    def test_fix(self):
        self.fh = open(self.test_file)
        self.tarfixer.fh = self.fh
        self.tarfixer.fix()
        self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist)))

    def test_padded_size_length_small(self):
        length = 10
        block_size = 512
        self.assertEqual(512, self.tarfixer.padded_size(length, block_size))

    def test_padded_size_length_spot_on(self):
        length = 512
        block_size = 512
        self.assertEqual(512, self.tarfixer.padded_size(length, block_size))

    def test_padded_size_length_over(self):
        length = 513
        block_size = 512
        self.assertEqual(1024, self.tarfixer.padded_size(length, block_size))

    def test_create_extended_header(self):
        self.tarfixer.create_extended_header()
        header = "".join(self.out.buflist)
        self.assertEqual(512, len(header))
        self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
        self.assertEqual("\x00" * (512 - 53), header[53:])

    def test_calculate_checksume(self):
        result = self.tarfixer.calculate_checksum(['\x01', '\x02', '\x03', '\x04'])
        expected_result = 10 + ord(" ") * 8
        self.assertEqual("%07o\x00" % expected_result, result)

    def test_encode_header(self):
        mode = 123
        chunk = {
            'mode': mode,
            'name': 'hello',
        }
        result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
        expected_result = ["%07o\x00" % mode, 'hello']
        self.assertEqual(result, expected_result)
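
As a usage note, the reference fixture the test compares against could in principle be regenerated by running TarFixer over the raw archive with the same constants. The snippet below is a hypothetical sketch of that; the resource paths and the idea of regenerating the fixture are assumptions, not part of this commit:

from tito.tar import TarFixer

EXPECTED_TIMESTAMP = 1429725106
EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"

src = open('test/unit/resources/archive.tar', 'rb')
dest = open('test/unit/resources/archive-fixed.tar', 'wb')
try:
    # TarFixer.fix() closes its input handle itself
    TarFixer(src, dest, EXPECTED_TIMESTAMP, EXPECTED_REF).fix()
finally:
    dest.close()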

tito.spec

@@ -112,7 +112,6 @@ rm -rf $RPM_BUILD_ROOT
%doc %{_mandir}/man5/releasers.conf.5*
%doc %{_mandir}/man8/tito.8*
%{_bindir}/tito
%{_bindir}/tar-fixup-stamp-comment.pl
%{_bindir}/test-setup-specfile.pl
%{_bindir}/generate-patches.pl
%{_datadir}/bash-completion/completions/tito