From a5b43b6b96692c489a2398c85f7e2ac48d054677 Mon Sep 17 00:00:00 2001 From: Alex Wood Date: Fri, 15 May 2015 13:06:44 -0400 Subject: [PATCH] Fix Python 3 issues with binary versus string types. Python 3 is very picky about not mixing binary and string data. This patch gets TarFixer running on both Python 2.6+ and Python 3.x. --- src/tito/compat.py | 14 ++++++++++ src/tito/tar.py | 63 ++++++++++++++++++++++++++++--------------- test/unit/test_tar.py | 28 +++++++++++++------ 3 files changed, 75 insertions(+), 30 deletions(-) diff --git a/src/tito/compat.py b/src/tito/compat.py index b29b416..a9ec6cd 100644 --- a/src/tito/compat.py +++ b/src/tito/compat.py @@ -34,6 +34,20 @@ else: import xmlrpc.client as xmlrpclib +def decode_bytes(x, source_encoding): + if PY2: + return x + else: + return x.decode(source_encoding) + + +def encode_bytes(x, destination_encoding): + if PY2: + return x + else: + return bytes(x, destination_encoding) + + def getstatusoutput(cmd): """ Returns (status, output) of executing cmd in a shell. diff --git a/src/tito/tar.py b/src/tito/tar.py index 5b579fd..6d93c15 100644 --- a/src/tito/tar.py +++ b/src/tito/tar.py @@ -15,6 +15,8 @@ import re import struct import sys +from tito.compat import decode_bytes, encode_bytes + RECORD_SIZE = 512 # Git writes its tarballs to be a multiple of 10240. I'm not sure why: the @@ -53,6 +55,8 @@ class TarFixer(object): pax extended header records have the format "%u %s=%s\n". %u contains the size of the whole string (including the %u), the first %s is the keyword, the second one is the value. + + PAX (also known as POSIX.1-2001) always encodes everything in UTF-8. """ def __init__(self, fh, out, timestamp, gitref, maven_built=False): self.maven_built = maven_built @@ -96,7 +100,7 @@ class TarFixer(object): # Add an '=' to use native byte order with standard sizes self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct)) - self.struct_members = map(lambda x: x[0], self.tar_struct) + self.struct_members = list(map(lambda x: x[0], self.tar_struct)) self.struct_hash = dict(self.tar_struct) # The tarballs created by git archive from tree IDs don't have a global @@ -128,13 +132,22 @@ class TarFixer(object): return read + def write(self, data): + """Write the data correctly depending on the mode of the file. While binary mode + is preferred, we support text mode for streams like stdout.""" + if hasattr(self.out, 'mode') and 'b' in self.out.mode: + data = bytearray(data) + else: + data = decode_bytes(data, "utf8") + self.out.write(data) + def chunk_to_hash(self, chunk): # Our struct template is only 500 bytes, but the last 12 bytes are NUL # I elected to ignore them completely instead of including them in the # template as '12x'. The unpack_from method will read the bytes our # template defines from chunk and discard the rest. unpacked = struct.unpack_from(self.struct_template, chunk) - + unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked)) # Zip what we read together with the member names and create a dictionary chunk_props = dict(zip(self.struct_members, unpacked)) @@ -143,28 +156,28 @@ class TarFixer(object): def padded_size(self, length, pad_size=RECORD_SIZE): """Function to pad out a length to the nearest multiple of pad_size that can contain it.""" - blocks = length / pad_size + blocks = length // pad_size if length % pad_size != 0: blocks += 1 return blocks * pad_size def create_global_header(self): header_props = { - 'name': 'pax_global_header', + 'name': u'pax_global_header', 'mode': 0o666, 'uid': 0, 'gid': 0, 'size': 52, # The size of the extended header with the gitref 'mtime': self.timestamp, - 'typeflag': 'g', - 'linkname': '', - 'magic': 'ustar', - 'version': '00', - 'uname': 'root', - 'gname': 'root', + 'typeflag': u'g', + 'linkname': u'', + 'magic': u'ustar', + 'version': u'00', + 'uname': u'root', + 'gname': u'root', 'devmajor': 0, 'devminor': 0, - 'prefix': '', + 'prefix': u'', } self.process_header(header_props) @@ -179,9 +192,10 @@ class TarFixer(object): member_template = self.struct_hash[member] field_size = int(re.match('(\d+)', member_template).group(1)) - 1 fmt = "%0" + str(field_size) + "o\x00" - pack_values.append(fmt % chunk_props[member]) + as_string = fmt % chunk_props[member] + pack_values.append(as_string.encode("utf8")) else: - pack_values.append(chunk_props[member]) + pack_values.append(chunk_props[member].encode("utf8")) return pack_values def process_header(self, chunk_props): @@ -191,7 +205,7 @@ class TarFixer(object): # The struct itself is only 500 bytes so we have to pad it to 512 data_out = struct.pack(self.struct_template + "12x", *pack_values) - self.out.write(data_out) + self.write(data_out) self.total_length += len(data_out) def process_extended_header(self): @@ -207,13 +221,13 @@ class TarFixer(object): # Since the git ref is always 40 characters we can # pre-compute the length to put in the extended header comment = "52 comment=%s\n" % self.gitref - data_out = struct.pack("=512s", comment) - self.out.write(data_out) + data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii")) + self.write(data_out) self.total_length += len(data_out) def process_file_data(self, size): data_out = self.full_read(self.padded_size(size)) - self.out.write(data_out) + self.write(data_out) self.total_length += len(data_out) def calculate_checksum(self, chunk_props): @@ -227,17 +241,19 @@ class TarFixer(object): values = self.encode_header(chunk_props) new_chksum = 0 for val in values: - val_bytes = bytearray(val, 'ASCII') - new_chksum += reduce(lambda x, y: x + y, val_bytes, 0) + val_bytes = val.decode("utf8") + for b in val_bytes: + new_chksum += ord(b) return "%07o\x00" % new_chksum def process_chunk(self, chunk): # Tar archives end with two 512 byte blocks of zeroes - if chunk == "\x00" * 512: - self.out.write(chunk) + if chunk == b"\x00" * 512: + self.write(b"\x00" * 512) self.total_length += len(chunk) if self.last_chunk_was_nulls: - self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)) + final_padding = b"\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length) + self.write(final_padding) self.done = True self.last_chunk_was_nulls = True return @@ -299,6 +315,9 @@ class TarFixer(object): self.process_file_data(chunk_props['size']) def fix(self): + if 'b' not in self.fh.mode: + raise IOError("The input file must be opened in binary mode!") + try: chunk = self.full_read(RECORD_SIZE) while chunk != "" and not self.done: diff --git a/test/unit/test_tar.py b/test/unit/test_tar.py index f395bf7..6007298 100644 --- a/test/unit/test_tar.py +++ b/test/unit/test_tar.py @@ -2,7 +2,7 @@ import hashlib import os import unittest -from tito.compat import StringIO +from tito.compat import StringIO, encode_bytes from tito.tar import TarFixer from mock import Mock @@ -22,7 +22,8 @@ class TarTest(unittest.TestCase): self.out = None def hash_file(self, filename): - return self.hash_buffer(open(filename, 'rb').read()) + file_bytes = open(filename, 'rb').read() + return self.hash_buffer(file_bytes) def hash_buffer(self, buf): hasher = hashlib.sha256() @@ -61,10 +62,15 @@ class TarTest(unittest.TestCase): self.assertRaises(IOError, self.tarfixer.full_read, 10) def test_fix(self): - self.fh = open(self.test_file) + self.fh = open(self.test_file, 'rb') self.tarfixer.fh = self.fh self.tarfixer.fix() - self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist))) + self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8"))) + + def test_fix_fails_unless_file_in_binary_mode(self): + self.fh = open(self.test_file, 'r') + self.tarfixer.fh = self.fh + self.assertRaises(IOError, self.tarfixer.fix) def test_padded_size_length_small(self): length = 10 @@ -81,9 +87,14 @@ class TarTest(unittest.TestCase): block_size = 512 self.assertEqual(1024, self.tarfixer.padded_size(length, block_size)) + def test_padded_size_length_long(self): + length = 82607 + block_size = 512 + self.assertEqual(82944, self.tarfixer.padded_size(length, block_size)) + def test_create_extended_header(self): self.tarfixer.create_extended_header() - header = "".join(self.out.buflist) + header = self.out.getvalue() self.assertEqual(512, len(header)) self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52]) self.assertEqual("\x00" * (512 - 53), header[53:]) @@ -95,7 +106,7 @@ class TarTest(unittest.TestCase): 'c': '\x03', 'd': '\x04', } - self.tarfixer.struct_members = fields.keys() + ['checksum'] + self.tarfixer.struct_members = list(fields.keys()) + ['checksum'] result = self.tarfixer.calculate_checksum(fields) expected_result = 10 + ord(" ") * 8 self.assertEqual("%07o\x00" % expected_result, result) @@ -107,5 +118,6 @@ class TarTest(unittest.TestCase): 'name': 'hello', } result = self.tarfixer.encode_header(chunk, ['mode', 'name']) - expected_result = ["%07o\x00" % mode, 'hello'] - self.assertEqual(result, expected_result) + expected_result = ["%07o\x00" % mode, "hello"] + expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result)) + self.assertEqual(expected_result, result)