Fix Python 3 issues with binary versus string types.

Python 3 is very picky about not mixing binary and string data.  This
patch gets TarFixer running on both Python 2.6+ and Python 3.x.
Alex Wood 2015-05-15 13:06:44 -04:00
parent 07d62cf24e
commit a5b43b6b96
3 changed files with 75 additions and 30 deletions
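For context, the incompatibility the commit message refers to: Python 2's str type already is a byte string, while Python 3 keeps bytes and str separate and refuses to mix them. A minimal standalone sketch of the failure mode (not part of the patch):

import sys

PY2 = sys.version_info[0] == 2

magic = b"ustar"        # raw bytes pulled out of a tar header
label = "magic="        # native text string

if PY2:
    print(label + magic)                    # Python 2: str *is* bytes, so this just works
else:
    try:
        label + magic                       # Python 3: TypeError, str and bytes don't mix
    except TypeError:
        pass
    print(label + magic.decode("utf8"))     # decode (or encode) explicitly instead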

View file

@@ -34,6 +34,20 @@ else:
import xmlrpc.client as xmlrpclib
def decode_bytes(x, source_encoding):
if PY2:
return x
else:
return x.decode(source_encoding)
def encode_bytes(x, destination_encoding):
if PY2:
return x
else:
return bytes(x, destination_encoding)
def getstatusoutput(cmd):
"""
Returns (status, output) of executing cmd in a shell.

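The decode_bytes/encode_bytes helpers added above are pass-throughs on Python 2, where str is already a byte string, and real conversions on Python 3. A rough round-trip example (assuming tito.compat is importable after this change):

from tito.compat import decode_bytes, encode_bytes

raw = encode_bytes(u"pax_global_header", "utf8")          # bytes on Python 3, plain str on Python 2
assert decode_bytes(raw, "utf8") == u"pax_global_header"  # round-trips on both interpreters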
View file

@@ -15,6 +15,8 @@ import re
import struct
import sys
from tito.compat import decode_bytes, encode_bytes
RECORD_SIZE = 512
# Git writes its tarballs to be a multiple of 10240. I'm not sure why: the
@@ -53,6 +55,8 @@ class TarFixer(object):
pax extended header records have the format "%u %s=%s\n". %u contains
the size of the whole string (including the %u), the first %s is the
keyword, the second one is the value.
PAX (also known as POSIX.1-2001) always encodes everything in UTF-8.
"""
def __init__(self, fh, out, timestamp, gitref, maven_built=False):
self.maven_built = maven_built
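The 52-byte size used later in create_global_header and process_extended_header falls straight out of the record format described in this docstring: with a 40-character git ref, the whole "%u %s=%s\n" record is exactly 52 bytes, length field included. A sketch of that arithmetic (pax_record is a hypothetical helper, not part of TarFixer):

def pax_record(keyword, value):
    # "%u %s=%s\n" where the leading number counts the entire record,
    # including its own digits, the space, the '=', and the newline.
    body = " %s=%s\n" % (keyword, value)
    length = len(body)
    while len(str(length)) + len(body) != length:
        length = len(str(length)) + len(body)
    return "%d%s" % (length, body)

# A 40-character git ref always produces the 52-byte record TarFixer hard-codes:
assert pax_record("comment", "0" * 40) == "52 comment=%s\n" % ("0" * 40)
assert len(pax_record("comment", "0" * 40)) == 52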
@@ -96,7 +100,7 @@ class TarFixer(object):
# Add an '=' to use native byte order with standard sizes
self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct))
self.struct_members = map(lambda x: x[0], self.tar_struct)
self.struct_members = list(map(lambda x: x[0], self.tar_struct))
self.struct_hash = dict(self.tar_struct)
# The tarballs created by git archive from tree IDs don't have a global
@@ -128,13 +132,22 @@ class TarFixer(object):
return read
def write(self, data):
"""Write the data correctly depending on the mode of the file. While binary mode
is preferred, we support text mode for streams like stdout."""
if hasattr(self.out, 'mode') and 'b' in self.out.mode:
data = bytearray(data)
else:
data = decode_bytes(data, "utf8")
self.out.write(data)
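The new write() above hands raw bytes to binary-mode files and decodes first for text-mode streams such as stdout. A standalone mirror of that check (with a hypothetical output filename):

import io

def write(out, data):
    # Binary sinks take the raw bytes; text sinks (sys.stdout and friends) need text.
    if hasattr(out, "mode") and "b" in out.mode:
        out.write(bytearray(data))
    else:
        out.write(data.decode("utf8"))

write(io.StringIO(), b"pax_global_header")   # no .mode attribute -> decoded to text first
with open("fixed.tar", "wb") as f:           # hypothetical output file, 'b' in f.mode
    write(f, b"\x00" * 512)                  # raw bytes pass straight through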
def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL
# I elected to ignore them completely instead of including them in the
# template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
# Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked))
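The list() wrapper added around map() in __init__ matters here: on Python 3, map() returns a single-use iterator, so without the conversion struct_members would be exhausted after the first header and this dict(zip(...)) would silently come back empty. For example:

tar_struct = [("name", "100s"), ("mode", "8s")]       # cut-down stand-in for the real template

members = map(lambda x: x[0], tar_struct)             # a one-shot iterator on Python 3
first = dict(zip(members, ("hello", "0000644")))
second = dict(zip(members, ("world", "0000755")))     # iterator already exhausted

assert first == {"name": "hello", "mode": "0000644"}
assert second == {}     # Python 3 only; Python 2's map() returns a reusable list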
@@ -143,28 +156,28 @@ class TarFixer(object):
def padded_size(self, length, pad_size=RECORD_SIZE):
"""Function to pad out a length to the nearest multiple of pad_size
that can contain it."""
blocks = length / pad_size
blocks = length // pad_size
if length % pad_size != 0:
blocks += 1
return blocks * pad_size
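The switch from / to // is needed because Python 3's / always produces a float, which broke the block-count arithmetic. The same rounding, spelled out with concrete numbers:

length, pad_size = 600, 512

blocks = length // pad_size        # 1; plain / would give 1.171875 on Python 3
if length % pad_size != 0:
    blocks += 1                    # 600 bytes don't land on a 512-byte boundary

assert blocks * pad_size == 1024   # smallest multiple of 512 that holds 600 bytes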
def create_global_header(self):
header_props = {
'name': 'pax_global_header',
'name': u'pax_global_header',
'mode': 0o666,
'uid': 0,
'gid': 0,
'size': 52, # The size of the extended header with the gitref
'mtime': self.timestamp,
'typeflag': 'g',
'linkname': '',
'magic': 'ustar',
'version': '00',
'uname': 'root',
'gname': 'root',
'typeflag': u'g',
'linkname': u'',
'magic': u'ustar',
'version': u'00',
'uname': u'root',
'gname': u'root',
'devmajor': 0,
'devminor': 0,
'prefix': '',
'prefix': u'',
}
self.process_header(header_props)
@@ -179,9 +192,10 @@ class TarFixer(object):
member_template = self.struct_hash[member]
field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00"
pack_values.append(fmt % chunk_props[member])
as_string = fmt % chunk_props[member]
pack_values.append(as_string.encode("utf8"))
else:
pack_values.append(chunk_props[member])
pack_values.append(chunk_props[member].encode("utf8"))
return pack_values
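The octal branch above renders a numeric field as a fixed-width, NUL-terminated octal string sized from the struct template, then encodes it, since struct.pack() on Python 3 only accepts bytes for 's' fields. Roughly, for the 8-byte ustar mode field:

import re

member_template = "8s"             # the ustar mode field is 8 bytes wide
field_size = int(re.match(r"(\d+)", member_template).group(1)) - 1   # 7 octal digits + NUL
fmt = "%0" + str(field_size) + "o\x00"

assert fmt % 0o666 == "0000666\x00"                      # the global header's mode, as text
assert (fmt % 0o666).encode("utf8") == b"0000666\x00"    # what actually goes into struct.pack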
def process_header(self, chunk_props):
@@ -191,7 +205,7 @@ class TarFixer(object):
# The struct itself is only 500 bytes so we have to pad it to 512
data_out = struct.pack(self.struct_template + "12x", *pack_values)
self.out.write(data_out)
self.write(data_out)
self.total_length += len(data_out)
def process_extended_header(self):
@@ -207,13 +221,13 @@ class TarFixer(object):
# Since the git ref is always 40 characters we can
# pre-compute the length to put in the extended header
comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=512s", comment)
self.out.write(data_out)
data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
self.write(data_out)
self.total_length += len(data_out)
def process_file_data(self, size):
data_out = self.full_read(self.padded_size(size))
self.out.write(data_out)
self.write(data_out)
self.total_length += len(data_out)
def calculate_checksum(self, chunk_props):
@@ -227,17 +241,19 @@ class TarFixer(object):
values = self.encode_header(chunk_props)
new_chksum = 0
for val in values:
val_bytes = bytearray(val, 'ASCII')
new_chksum += reduce(lambda x, y: x + y, val_bytes, 0)
val_bytes = val.decode("utf8")
for b in val_bytes:
new_chksum += ord(b)
return "%07o\x00" % new_chksum
def process_chunk(self, chunk):
# Tar archives end with two 512 byte blocks of zeroes
if chunk == "\x00" * 512:
self.out.write(chunk)
if chunk == b"\x00" * 512:
self.write(b"\x00" * 512)
self.total_length += len(chunk)
if self.last_chunk_was_nulls:
self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length))
final_padding = b"\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)
self.write(final_padding)
self.done = True
self.last_chunk_was_nulls = True
return
@@ -299,6 +315,9 @@ class TarFixer(object):
self.process_file_data(chunk_props['size'])
def fix(self):
if 'b' not in self.fh.mode:
raise IOError("The input file must be opened in binary mode!")
try:
chunk = self.full_read(RECORD_SIZE)
while chunk != "" and not self.done:

View file

@@ -2,7 +2,7 @@ import hashlib
import os
import unittest
from tito.compat import StringIO
from tito.compat import StringIO, encode_bytes
from tito.tar import TarFixer
from mock import Mock
@@ -22,7 +22,8 @@ class TarTest(unittest.TestCase):
self.out = None
def hash_file(self, filename):
return self.hash_buffer(open(filename, 'rb').read())
file_bytes = open(filename, 'rb').read()
return self.hash_buffer(file_bytes)
def hash_buffer(self, buf):
hasher = hashlib.sha256()
@@ -61,10 +62,15 @@ class TarTest(unittest.TestCase):
self.assertRaises(IOError, self.tarfixer.full_read, 10)
def test_fix(self):
self.fh = open(self.test_file)
self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist)))
self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
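The assertion above moves from joining StringIO's buflist to calling getvalue(), which exists on both Python 2's StringIO and Python 3's io.StringIO; buflist was a Python 2 implementation detail. A quick illustration:

import io

out = io.StringIO()
out.write(u"52 comment=")
out.write(u"0" * 40 + u"\n")

assert out.getvalue().startswith(u"52 comment=")   # getvalue() works on Python 2 and 3
assert not hasattr(out, "buflist")                 # buflist was a Python 2 StringIO internal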
def test_fix_fails_unless_file_in_binary_mode(self):
self.fh = open(self.test_file, 'r')
self.tarfixer.fh = self.fh
self.assertRaises(IOError, self.tarfixer.fix)
def test_padded_size_length_small(self):
length = 10
@@ -81,9 +87,14 @@ class TarTest(unittest.TestCase):
block_size = 512
self.assertEqual(1024, self.tarfixer.padded_size(length, block_size))
def test_padded_size_length_long(self):
length = 82607
block_size = 512
self.assertEqual(82944, self.tarfixer.padded_size(length, block_size))
def test_create_extended_header(self):
self.tarfixer.create_extended_header()
header = "".join(self.out.buflist)
header = self.out.getvalue()
self.assertEqual(512, len(header))
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:])
@@ -95,7 +106,7 @@ class TarTest(unittest.TestCase):
'c': '\x03',
'd': '\x04',
}
self.tarfixer.struct_members = fields.keys() + ['checksum']
self.tarfixer.struct_members = list(fields.keys()) + ['checksum']
result = self.tarfixer.calculate_checksum(fields)
expected_result = 10 + ord(" ") * 8
self.assertEqual("%07o\x00" % expected_result, result)
@@ -107,5 +118,6 @@ class TarTest(unittest.TestCase):
'name': 'hello',
}
result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, 'hello']
self.assertEqual(result, expected_result)
expected_result = ["%07o\x00" % mode, "hello"]
expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
self.assertEqual(expected_result, result)