Fix Python 3 issues with binary versus string types.

Python 3 is very picky about not mixing binary and string data.  This
patch gets TarFixer running on both Python 2.6+ and Python 3.x.
This commit is contained in:
Alex Wood 2015-05-15 13:06:44 -04:00
parent 07d62cf24e
commit a5b43b6b96
3 changed files with 75 additions and 30 deletions

View file

@@ -34,6 +34,20 @@ else:
import xmlrpc.client as xmlrpclib import xmlrpc.client as xmlrpclib
def decode_bytes(x, source_encoding):
    """Return *x* as a text string.

    Under Python 2 the value is passed through untouched; under Python 3
    the bytes are decoded using *source_encoding*.
    """
    if not PY2:
        return x.decode(source_encoding)
    return x
def encode_bytes(x, destination_encoding):
    """Return *x* as a byte string.

    Under Python 2 the value is passed through untouched; under Python 3
    the text is encoded to bytes using *destination_encoding*.
    """
    return x if PY2 else bytes(x, destination_encoding)
def getstatusoutput(cmd): def getstatusoutput(cmd):
""" """
Returns (status, output) of executing cmd in a shell. Returns (status, output) of executing cmd in a shell.

View file

@@ -15,6 +15,8 @@ import re
import struct import struct
import sys import sys
from tito.compat import decode_bytes, encode_bytes
RECORD_SIZE = 512 RECORD_SIZE = 512
# Git writes its tarballs to be a multiple of 10240. I'm not sure why: the # Git writes its tarballs to be a multiple of 10240. I'm not sure why: the
@@ -53,6 +55,8 @@ class TarFixer(object):
pax extended header records have the format "%u %s=%s\n". %u contains pax extended header records have the format "%u %s=%s\n". %u contains
the size of the whole string (including the %u), the first %s is the the size of the whole string (including the %u), the first %s is the
keyword, the second one is the value. keyword, the second one is the value.
PAX (also known as POSIX.1-2001) always encodes everything in UTF-8.
""" """
def __init__(self, fh, out, timestamp, gitref, maven_built=False): def __init__(self, fh, out, timestamp, gitref, maven_built=False):
self.maven_built = maven_built self.maven_built = maven_built
@@ -96,7 +100,7 @@ class TarFixer(object):
# Add an '=' to use native byte order with standard sizes # Add an '=' to use native byte order with standard sizes
self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct)) self.struct_template = "=" + "".join(map(lambda x: x[1], self.tar_struct))
self.struct_members = map(lambda x: x[0], self.tar_struct) self.struct_members = list(map(lambda x: x[0], self.tar_struct))
self.struct_hash = dict(self.tar_struct) self.struct_hash = dict(self.tar_struct)
# The tarballs created by git archive from tree IDs don't have a global # The tarballs created by git archive from tree IDs don't have a global
@@ -128,13 +132,22 @@ class TarFixer(object):
return read return read
def write(self, data):
    """Write *data* to the output stream, adapting to the stream's mode.

    Binary-mode streams receive the raw bytes; text-mode streams (such
    as stdout) receive the UTF-8-decoded string instead.
    """
    binary_out = hasattr(self.out, 'mode') and 'b' in self.out.mode
    if binary_out:
        self.out.write(bytearray(data))
    else:
        self.out.write(decode_bytes(data, "utf8"))
def chunk_to_hash(self, chunk): def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL # Our struct template is only 500 bytes, but the last 12 bytes are NUL
# I elected to ignore them completely instead of including them in the # I elected to ignore them completely instead of including them in the
# template as '12x'. The unpack_from method will read the bytes our # template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest. # template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk) unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
# Zip what we read together with the member names and create a dictionary # Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked)) chunk_props = dict(zip(self.struct_members, unpacked))
@@ -143,28 +156,28 @@ class TarFixer(object):
def padded_size(self, length, pad_size=RECORD_SIZE): def padded_size(self, length, pad_size=RECORD_SIZE):
"""Function to pad out a length to the nearest multiple of pad_size """Function to pad out a length to the nearest multiple of pad_size
that can contain it.""" that can contain it."""
blocks = length / pad_size blocks = length // pad_size
if length % pad_size != 0: if length % pad_size != 0:
blocks += 1 blocks += 1
return blocks * pad_size return blocks * pad_size
def create_global_header(self): def create_global_header(self):
header_props = { header_props = {
'name': 'pax_global_header', 'name': u'pax_global_header',
'mode': 0o666, 'mode': 0o666,
'uid': 0, 'uid': 0,
'gid': 0, 'gid': 0,
'size': 52, # The size of the extended header with the gitref 'size': 52, # The size of the extended header with the gitref
'mtime': self.timestamp, 'mtime': self.timestamp,
'typeflag': 'g', 'typeflag': u'g',
'linkname': '', 'linkname': u'',
'magic': 'ustar', 'magic': u'ustar',
'version': '00', 'version': u'00',
'uname': 'root', 'uname': u'root',
'gname': 'root', 'gname': u'root',
'devmajor': 0, 'devmajor': 0,
'devminor': 0, 'devminor': 0,
'prefix': '', 'prefix': u'',
} }
self.process_header(header_props) self.process_header(header_props)
@@ -179,9 +192,10 @@ class TarFixer(object):
member_template = self.struct_hash[member] member_template = self.struct_hash[member]
field_size = int(re.match('(\d+)', member_template).group(1)) - 1 field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00" fmt = "%0" + str(field_size) + "o\x00"
pack_values.append(fmt % chunk_props[member]) as_string = fmt % chunk_props[member]
pack_values.append(as_string.encode("utf8"))
else: else:
pack_values.append(chunk_props[member]) pack_values.append(chunk_props[member].encode("utf8"))
return pack_values return pack_values
def process_header(self, chunk_props): def process_header(self, chunk_props):
@@ -191,7 +205,7 @@ class TarFixer(object):
# The struct itself is only 500 bytes so we have to pad it to 512 # The struct itself is only 500 bytes so we have to pad it to 512
data_out = struct.pack(self.struct_template + "12x", *pack_values) data_out = struct.pack(self.struct_template + "12x", *pack_values)
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def process_extended_header(self): def process_extended_header(self):
@@ -207,13 +221,13 @@ class TarFixer(object):
# Since the git ref is always 40 characters we can # Since the git ref is always 40 characters we can
# pre-compute the length to put in the extended header # pre-compute the length to put in the extended header
comment = "52 comment=%s\n" % self.gitref comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=512s", comment) data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def process_file_data(self, size): def process_file_data(self, size):
data_out = self.full_read(self.padded_size(size)) data_out = self.full_read(self.padded_size(size))
self.out.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
def calculate_checksum(self, chunk_props): def calculate_checksum(self, chunk_props):
@@ -227,17 +241,19 @@ class TarFixer(object):
values = self.encode_header(chunk_props) values = self.encode_header(chunk_props)
new_chksum = 0 new_chksum = 0
for val in values: for val in values:
val_bytes = bytearray(val, 'ASCII') val_bytes = val.decode("utf8")
new_chksum += reduce(lambda x, y: x + y, val_bytes, 0) for b in val_bytes:
new_chksum += ord(b)
return "%07o\x00" % new_chksum return "%07o\x00" % new_chksum
def process_chunk(self, chunk): def process_chunk(self, chunk):
# Tar archives end with two 512 byte blocks of zeroes # Tar archives end with two 512 byte blocks of zeroes
if chunk == "\x00" * 512: if chunk == b"\x00" * 512:
self.out.write(chunk) self.write(b"\x00" * 512)
self.total_length += len(chunk) self.total_length += len(chunk)
if self.last_chunk_was_nulls: if self.last_chunk_was_nulls:
self.out.write("\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)) final_padding = b"\x00" * (self.padded_size(self.total_length, GIT_BLOCK_SIZE) - self.total_length)
self.write(final_padding)
self.done = True self.done = True
self.last_chunk_was_nulls = True self.last_chunk_was_nulls = True
return return
@@ -299,6 +315,9 @@ class TarFixer(object):
self.process_file_data(chunk_props['size']) self.process_file_data(chunk_props['size'])
def fix(self): def fix(self):
if 'b' not in self.fh.mode:
raise IOError("The input file must be opened in binary mode!")
try: try:
chunk = self.full_read(RECORD_SIZE) chunk = self.full_read(RECORD_SIZE)
while chunk != "" and not self.done: while chunk != "" and not self.done:

View file

@@ -2,7 +2,7 @@ import hashlib
import os import os
import unittest import unittest
from tito.compat import StringIO from tito.compat import StringIO, encode_bytes
from tito.tar import TarFixer from tito.tar import TarFixer
from mock import Mock from mock import Mock
@@ -22,7 +22,8 @@ class TarTest(unittest.TestCase):
self.out = None self.out = None
def hash_file(self, filename): def hash_file(self, filename):
return self.hash_buffer(open(filename, 'rb').read()) file_bytes = open(filename, 'rb').read()
return self.hash_buffer(file_bytes)
def hash_buffer(self, buf): def hash_buffer(self, buf):
hasher = hashlib.sha256() hasher = hashlib.sha256()
@@ -61,10 +62,15 @@ class TarTest(unittest.TestCase):
self.assertRaises(IOError, self.tarfixer.full_read, 10) self.assertRaises(IOError, self.tarfixer.full_read, 10)
def test_fix(self): def test_fix(self):
self.fh = open(self.test_file) self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh self.tarfixer.fh = self.fh
self.tarfixer.fix() self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer("".join(self.out.buflist))) self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
def test_fix_fails_unless_file_in_binary_mode(self):
    """fix() must raise IOError when the input file is opened in text mode."""
    text_handle = open(self.test_file, 'r')
    self.fh = text_handle
    self.tarfixer.fh = text_handle
    self.assertRaises(IOError, self.tarfixer.fix)
def test_padded_size_length_small(self): def test_padded_size_length_small(self):
length = 10 length = 10
@@ -81,9 +87,14 @@ class TarTest(unittest.TestCase):
block_size = 512 block_size = 512
self.assertEqual(1024, self.tarfixer.padded_size(length, block_size)) self.assertEqual(1024, self.tarfixer.padded_size(length, block_size))
def test_padded_size_length_long(self):
    """A large, non-aligned length pads up to the next 512-byte boundary."""
    # 82607 is not a multiple of 512; the next multiple is 82944.
    self.assertEqual(82944, self.tarfixer.padded_size(82607, 512))
def test_create_extended_header(self): def test_create_extended_header(self):
self.tarfixer.create_extended_header() self.tarfixer.create_extended_header()
header = "".join(self.out.buflist) header = self.out.getvalue()
self.assertEqual(512, len(header)) self.assertEqual(512, len(header))
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52]) self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:]) self.assertEqual("\x00" * (512 - 53), header[53:])
@@ -95,7 +106,7 @@ class TarTest(unittest.TestCase):
'c': '\x03', 'c': '\x03',
'd': '\x04', 'd': '\x04',
} }
self.tarfixer.struct_members = fields.keys() + ['checksum'] self.tarfixer.struct_members = list(fields.keys()) + ['checksum']
result = self.tarfixer.calculate_checksum(fields) result = self.tarfixer.calculate_checksum(fields)
expected_result = 10 + ord(" ") * 8 expected_result = 10 + ord(" ") * 8
self.assertEqual("%07o\x00" % expected_result, result) self.assertEqual("%07o\x00" % expected_result, result)
@@ -107,5 +118,6 @@ class TarTest(unittest.TestCase):
'name': 'hello', 'name': 'hello',
} }
result = self.tarfixer.encode_header(chunk, ['mode', 'name']) result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, 'hello'] expected_result = ["%07o\x00" % mode, "hello"]
self.assertEqual(result, expected_result) expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
self.assertEqual(expected_result, result)