Fix Python 3 issues with binary versus string types.

Python 3 is very picky about not mixing binary and string data.  This
patch gets TarFixer running on both Python 2.6+ and Python 3.x.
This commit is contained in:
Alex Wood 2015-05-15 13:06:44 -04:00
parent 07d62cf24e
commit a5b43b6b96
3 changed files with 75 additions and 30 deletions

View file

@@ -34,6 +34,20 @@ else:
import xmlrpc.client as xmlrpclib import xmlrpc.client as xmlrpclib
def decode_bytes(x, source_encoding):
    """Return *x* as a text string.

    Under Python 2 the value is passed through untouched; under Python 3
    the bytes are decoded using *source_encoding*.
    """
    if not PY2:
        return x.decode(source_encoding)
    return x
def encode_bytes(x, destination_encoding):
    """Return *x* as a byte string.

    Under Python 2 the value is passed through untouched; under Python 3
    the text is encoded to bytes using *destination_encoding*.
    """
    return x if PY2 else bytes(x, destination_encoding)
def getstatusoutput(cmd): def getstatusoutput(cmd):
""" """
Returns (status, output) of executing cmd in a shell. Returns (status, output) of executing cmd in a shell.

View file

@@ -15,6 +15,8 @@ import re
import struct import struct
import sys import sys
from tito.compat import decode_bytes, encode_bytes
RECORD_SIZE = 512 RECORD_SIZE = 512
# Git writes its tarballs to be a multiple of 10240. I'm not sure why: the # Git writes its tarballs to be a multiple of 10240. I'm not sure why: the
@@ -53,6 +55,8 @@ class TarFixer(object):
pax extended header records have the format "%u %s=%s\n". %u contains pax extended header records have the format "%u %s=%s\n". %u contains
the size of the whole string (including the %u), the first %s is the the size of the whole string (including the %u), the first %s is the
keyword, the second one is the value. keyword, the second one is the value.
PAX (also known as POSIX.1-2001) always encodes everything in UTF-8.
""" """
def __init__(self, fh, out, timestamp, gitref, maven_built=False): def __init__(self, fh, out, timestamp, gitref, maven_built=False):
self.maven_built = maven_built self.maven_built = maven_built
@@ -96,7 +100,7 @@ class TarFixer(object):
# Add an '=' to use native byte order with standard sizes # Add an '=' to use native byte order with standard sizes
self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct)) self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct))
self.struct_members = map(lambda x: x[0], self.tar_struct) self.struct_members = list(map(lambda x: x[0], self.tar_struct))
self.struct_hash = dict(self.tar_struct) self.struct_hash = dict(self.tar_struct)
# The tarballs created by git archive from tree IDs don't have a global # The tarballs created by git archive from tree IDs don't have a global
@@ -128,13 +132,22 @@ class TarFixer(object):
return read return read
def write(self, data):
    """Write *data* to the output stream, adapting to the stream's mode.

    Binary-mode streams receive the raw bytes; text-mode streams (such
    as stdout) receive the UTF-8-decoded string instead.
    """
    binary_out = hasattr(self.out, 'mode') and 'b' in self.out.mode
    if binary_out:
        self.out.write(bytearray(data))
    else:
        self.out.write(decode_bytes(data, "utf8"))
def chunk_to_hash(self, chunk): def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL # Our struct template is only 500 bytes, but the last 12 bytes are NUL
# I elected to ignore them completely instead of including them in the # I elected to ignore them completely instead of including them in the
# template as '12x'. The unpack_from method will read the bytes our # template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest. # template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk) unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
# Zip what we read together with the member names and create a dictionary # Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked)) chunk_props = dict(zip(self.struct_members, unpacked))
@@ -143,28 +156,28 @@ class TarFixer(object):
def padded_size(self, length, pad_size=RECORD_SIZE): def padded_size(self, length, pad_size=RECORD_SIZE):
"""Function to pad out a length to the nearest multiple of pad_size """Function to pad out a length to the nearest multiple of pad_size
that can contain it.""" that can contain it."""
blocks = length / pad_size blocks = length // pad_size
if length % pad_size != 0: if length % pad_size != 0:
blocks += 1 blocks += 1
return blocks * pad_size return blocks * pad_size
def create_global_header(self): def create_global_header(self):
header_props = { header_props = {
'name': 'pax_global_header', 'name': u'pax_global_header',
'mode': 0o666, 'mode': 0o666,
'uid': 0, 'uid': 0,
'gid': 0, 'gid': 0,
'size': 52, # The size of the extended header with the gitref 'size': 52, # The size of the extended header with the gitref
'mtime': self.timestamp, 'mtime': self.timestamp,
'typeflag': 'g', 'typeflag': u'g',
'linkname': '', 'linkname': u'',
'magic': 'ustar', 'magic': u'ustar',
'version': '00', 'version': u'00',
'uname': 'root', 'uname': u'root',
'gname': 'root', 'gname': u'root',
'devmajor': 0, 'devmajor': 0,
'devminor': 0, 'devminor': 0,
'prefix': '', 'prefix': u'',
} }
self.process_header(header_props) self.process_header(header_props)
@@ -179,9 +192,10 @@ class TarFixer(object):
member_template = self.struct_hash[member] member_template = self.struct_hash[member]
field_size = int(re.match('(\d+)', member_template).group(1)) - 1 field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00" fmt = "%0" + str(field_size) + "o\x00"
pack_values.append(fmt % chunk_props[member]) as_string = fmt % chunk_props[member]
pack_values.append(as_string.encode("utf8"))
else: else:
pack_values.append(chunk_props[member]) pack_values.append(chunk_props[member].encode("utf8"))
return pack_values return pack_values
def process_header(self, chunk_props): def process_header(self, chunk_props):
@@ -191,7 +205,7 @@ class TarFixer(object):
# The struct itself is only 500 bytes so we have to pad it to 512 # The struct itself is only 500 bytes so we have to pad it to 512
data_out = struct.pack(self.struct_template + "12x", *pack_values) data_out = struct.pack(self.struct_template + "12x", *pack_values)
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def process_extended_header(self): def process_extended_header(self):
@@ -207,13 +221,13 @@ class TarFixer(object):
# Since the git ref is always 40 characters we can # Since the git ref is always 40 characters we can
# pre-compute the length to put in the extended header # pre-compute the length to put in the extended header
comment = "52 comment=%s\n" % self.gitref comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=512s", comment) data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def process_file_data(self, size): def process_file_data(self, size):
data_out = self.full_read(self.padded_size(size)) data_out = self.full_read(self.padded_size(size))
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def calculate_checksum(self, chunk_props): def calculate_checksum(self, chunk_props):
@@ -227,17 +241,19 @@ class TarFixer(object):
values = self.encode_header(chunk_props) values = self.encode_header(chunk_props)
new_chksum = 0 new_chksum = 0
for val in values: for val in values:
val_bytes = bytearray(val, 'ASCII') val_bytes = val.decode("utf8")
new_chksum += reduce(lambda x, y: x + y, val_bytes, 0) for b in val_bytes:
new_chksum += ord(b)
return "%07o\x00" % new_chksum return "%07o\x00" % new_chksum
def process_chunk(self, chunk): def process_chunk(self, chunk):
# Tar archives end with two 512 byte blocks of zeroes # Tar archives end with two 512 byte blocks of zeroes
if chunk == "\x00" * 512: if chunk == b"\x00" * 512:
self.out.write(chunk) self.write(b"\x00" * 512)
self.total_length += len(chunk) self.total_length += len(chunk)
if self.last_chunk_was_nulls: if self.last_chunk_was_nulls:
self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)) final_padding = b"\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)
self.write(final_padding)
self.done = True self.done = True
self.last_chunk_was_nulls = True self.last_chunk_was_nulls = True
return return
@@ -299,6 +315,9 @@ class TarFixer(object):
self.process_file_data(chunk_props['size']) self.process_file_data(chunk_props['size'])
def fix(self): def fix(self):
if 'b' not in self.fh.mode:
raise IOError("The input file must be opened in binary mode!")
try: try:
chunk = self.full_read(RECORD_SIZE) chunk = self.full_read(RECORD_SIZE)
while chunk != "" and not self.done: while chunk != "" and not self.done:

View file

@@ -2,7 +2,7 @@ import hashlib
import os import os
import unittest import unittest
from tito.compat import StringIO from tito.compat import StringIO, encode_bytes
from tito.tar import TarFixer from tito.tar import TarFixer
from mock import Mock from mock import Mock
@@ -22,7 +22,8 @@ class TarTest(unittest.TestCase):
self.out = None self.out = None
def hash_file(self, filename): def hash_file(self, filename):
return self.hash_buffer(open(filename, 'rb').read()) file_bytes = open(filename, 'rb').read()
return self.hash_buffer(file_bytes)
def hash_buffer(self, buf): def hash_buffer(self, buf):
hasher = hashlib.sha256() hasher = hashlib.sha256()
@@ -61,10 +62,15 @@ class TarTest(unittest.TestCase):
self.assertRaises(IOError, self.tarfixer.full_read, 10) self.assertRaises(IOError, self.tarfixer.full_read, 10)
def test_fix(self): def test_fix(self):
self.fh = open(self.test_file) self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh self.tarfixer.fh = self.fh
self.tarfixer.fix() self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist))) self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
def test_fix_fails_unless_file_in_binary_mode(self):
    """fix() must raise IOError when the input file is opened in text mode."""
    text_handle = open(self.test_file, 'r')
    self.fh = text_handle
    self.tarfixer.fh = text_handle
    self.assertRaises(IOError, self.tarfixer.fix)
def test_padded_size_length_small(self): def test_padded_size_length_small(self):
length = 10 length = 10
@@ -81,9 +87,14 @@ class TarTest(unittest.TestCase):
block_size = 512 block_size = 512
self.assertEqual(1024, self.tarfixer.padded_size(length, block_size)) self.assertEqual(1024, self.tarfixer.padded_size(length, block_size))
def test_padded_size_length_long(self):
    """A large, non-aligned length pads up to the next 512-byte boundary."""
    # 82607 is not a multiple of 512; the next multiple is 82944.
    self.assertEqual(82944, self.tarfixer.padded_size(82607, 512))
def test_create_extended_header(self): def test_create_extended_header(self):
self.tarfixer.create_extended_header() self.tarfixer.create_extended_header()
header = "".join(self.out.buflist) header = self.out.getvalue()
self.assertEqual(512, len(header)) self.assertEqual(512, len(header))
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52]) self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:]) self.assertEqual("\x00" * (512 - 53), header[53:])
@@ -95,7 +106,7 @@ class TarTest(unittest.TestCase):
'c': '\x03', 'c': '\x03',
'd': '\x04', 'd': '\x04',
} }
self.tarfixer.struct_members = fields.keys() + ['checksum'] self.tarfixer.struct_members = list(fields.keys()) + ['checksum']
result = self.tarfixer.calculate_checksum(fields) result = self.tarfixer.calculate_checksum(fields)
expected_result = 10 + ord(" ") * 8 expected_result = 10 + ord(" ") * 8
self.assertEqual("%07o\x00" % expected_result, result) self.assertEqual("%07o\x00" % expected_result, result)
@@ -107,5 +118,6 @@ class TarTest(unittest.TestCase):
'name': 'hello', 'name': 'hello',
} }
result = self.tarfixer.encode_header(chunk, ['mode', 'name']) result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, 'hello'] expected_result = ["%07o\x00" % mode, "hello"]
self.assertEqual(result, expected_result) expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
self.assertEqual(expected_result, result)