Fix #335. Handle source tarballs with UTF8 characters in the name.

Following the principles of https://nedbatchelder.com/text/unipain.html,
our goal is to decode bytes into unicode as soon as we read them and
encode unicode data back to bytes at the last second.

The specific problem we were seeing was caused by calling "encode" on a
byte string rather than a unicode string.  Python 2 tries to be
"helpful" and implicitly decodes the bytes as ASCII so that it can hand
a unicode string to the encode call.  Since the bytes aren't ASCII, that
implicit decode fails and we get a UnicodeDecodeError even though we
never explicitly asked for a decode at all.
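
To make the failure mode concrete, this is roughly what it looks like under Python 2 (illustrative only, not part of the commit; the filename bytes are made up):

    # Python 2 session, for illustration
    name = 'archiv\xc3\xa9'      # a byte string holding UTF-8 encoded data
    name.encode('utf8')          # Python 2 first runs name.decode('ascii'), which raises:
    # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6

    # Decoding to unicode on input and encoding back to bytes on output round-trips cleanly:
    name.decode('utf8').encode('utf8')   # == 'archiv\xc3\xa9'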

Also, calculate checksums correctly for tarballs that have files with
UTF8 characters in the file name.
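
A rough sketch of the checksum issue (illustrative, not taken from the commit): a tar header checksum is a sum over the raw header bytes, so summing ord() over a decoded unicode string counts a multi-byte UTF-8 character once instead of once per byte.

    # Illustrative comparison for a name containing a multi-byte character
    name = u'archiv\xe9'                            # u'archivé'
    per_char = sum(ord(c) for c in name)            # counts 'é' as a single value
    per_byte = sum(bytearray(name.encode('utf8')))  # counts each of the two UTF-8 bytes of 'é'
    assert per_char != per_byte                     # the per-byte sum is what the tar format expects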
Alex Wood, 2019-04-09 14:30:59 -04:00
parent 77d54e6579
commit 03509b36d5
5 changed files with 74 additions and 33 deletions


@@ -26,26 +26,34 @@ if PY2:
     from ConfigParser import RawConfigParser
     from StringIO import StringIO
     import xmlrpclib
+    text_type = unicode
+    binary_type = str
 else:
     import subprocess
     from configparser import NoOptionError
     from configparser import RawConfigParser
     from io import StringIO
     import xmlrpc.client as xmlrpclib
+    text_type = str
+    binary_type = bytes
 
-def decode_bytes(x, source_encoding):
-    if PY2:
-        return x
-    else:
-        return x.decode(source_encoding)
+def ensure_text(x, encoding="utf8"):
+    if isinstance(x, binary_type):
+        return x.decode(encoding)
+    elif isinstance(x, text_type):
+        return x
+    else:
+        raise TypeError("Not expecting type '%s'" % type(x))
 
-def encode_bytes(x, destination_encoding):
-    if PY2:
-        return x
-    else:
-        return bytes(x, destination_encoding)
+def ensure_binary(x, encoding="utf8"):
+    if isinstance(x, text_type):
+        return x.encode(encoding)
+    elif isinstance(x, binary_type):
+        return x
+    else:
+        raise TypeError("Not expecting type '%s'" % type(x))
 
 def getstatusoutput(cmd):
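
In short, the new helpers behave much like six's ensure_text/ensure_binary: a value already of the target type passes through, the other type is converted (UTF-8 by default), and anything else raises TypeError. A quick illustrative sketch of the intended use:

    from tito.compat import ensure_binary, ensure_text

    ensure_text(b'archiv\xc3\xa9')    # -> u'archivé', decoded as UTF-8
    ensure_text(u'archivé')           # -> u'archivé', already text, passed through
    ensure_binary(u'archivé')         # -> b'archiv\xc3\xa9', encoded as UTF-8
    ensure_binary(b'archiv\xc3\xa9')  # -> returned unchanged
    ensure_text(42)                   # raises TypeError: neither text nor bytes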


@@ -14,8 +14,8 @@
 import re
 import struct
 import sys
-import codecs
 
-from tito.compat import decode_bytes, encode_bytes
+import tito.compat
 
 RECORD_SIZE = 512
@@ -120,7 +120,7 @@ class TarFixer(object):
     def full_read(self, read_size):
         read = self.fh.read(read_size)
         amount_read = len(read)
-        while (amount_read < read_size):
+        while amount_read < read_size:
             left_to_read = read_size - amount_read
             next_read = self.fh.read(left_to_read)
@@ -133,13 +133,7 @@ class TarFixer(object):
         return read
 
     def write(self, data):
-        """Write the data correctly depending on the mode of the file. While binary mode
-        is preferred, we support text mode for streams like stdout."""
-        if hasattr(self.out, 'mode') and 'b' in self.out.mode:
-            data = bytearray(data)
-        else:
-            data = decode_bytes(data, "utf8")
-        self.out.write(data)
+        self.out.write(tito.compat.ensure_binary(data))
 
     def chunk_to_hash(self, chunk):
         # Our struct template is only 500 bytes, but the last 12 bytes are NUL
@@ -147,7 +141,7 @@ class TarFixer(object):
         # template as '12x'. The unpack_from method will read the bytes our
         # template defines from chunk and discard the rest.
         unpacked = struct.unpack_from(self.struct_template, chunk)
-        unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
+        unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
 
         # Zip what we read together with the member names and create a dictionary
         chunk_props = dict(zip(self.struct_members, unpacked))
@@ -193,9 +187,9 @@ class TarFixer(object):
                 field_size = int(re.match('(\d+)', member_template).group(1)) - 1
                 fmt = "%0" + str(field_size) + "o\x00"
                 as_string = fmt % chunk_props[member]
-                pack_values.append(as_string.encode("utf8"))
+                pack_values.append(tito.compat.ensure_binary(as_string))
             else:
-                pack_values.append(chunk_props[member].encode("utf8"))
+                pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
         return pack_values
 
     def process_header(self, chunk_props):
@@ -218,10 +212,10 @@ class TarFixer(object):
         # the size of the whole string (including the %u), the first %s is the
         # keyword, the second one is the value.
         #
-        # Since the git ref is always 40 characters we can
-        # pre-compute the length to put in the extended header
+        # Since the git ref is always 40 ASCII characters we can pre-compute the length
+        # to put in the extended header
         comment = "52 comment=%s\n" % self.gitref
-        data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
+        data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
         self.write(data_out)
         self.total_length += len(data_out)
@@ -241,9 +235,9 @@ class TarFixer(object):
         values = self.encode_header(chunk_props)
         new_chksum = 0
         for val in values:
-            val_bytes = val.decode("utf8")
+            val_bytes = bytearray(tito.compat.ensure_binary(val))
             for b in val_bytes:
-                new_chksum += ord(b)
+                new_chksum += b
         return "%07o\x00" % new_chksum
 
     def process_chunk(self, chunk):
@@ -336,8 +330,8 @@ class TarFixer(object):
 if __name__ == '__main__':
-    if len(sys.argv) != 4:
-        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])
+    if len(sys.argv) != 5:
+        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
 
     try:
         timestamp = int(sys.argv[1])
@@ -346,11 +340,17 @@ if __name__ == '__main__':
     gitref = sys.argv[2]
     tar_file = sys.argv[3]
+    destination_file = sys.argv[4]
+
+    try:
+        dfh = open(destination_file, 'wb')
+    except:
+        print("Could not open %s" % destination_file)
 
     try:
         fh = open(tar_file, 'rb')
     except:
         print("Could not read %s" % tar_file)
 
-    reader = TarFixer(fh, sys.stdout, timestamp, gitref)
+    reader = TarFixer(fh, dfh, timestamp, gitref)
     reader.fix()
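
For reference, the fixer can also be driven directly; a minimal illustrative sketch mirroring the new __main__ flow (the file names, timestamp, and ref below are placeholders), writing to a destination file opened in binary mode rather than to stdout:

    from tito.tar import TarFixer

    with open('archive.tar', 'rb') as fh, open('archive-fixed.tar', 'wb') as dfh:
        TarFixer(fh, dfh, 1234567890, 'deadbeef' * 5).fix()  # placeholder timestamp and 40-char ref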

Binary file not shown.

Binary file not shown.


@@ -1,8 +1,11 @@
+# coding=utf-8
 import hashlib
 import os
+import tarfile
 import unittest
+import io
 
-from tito.compat import StringIO, encode_bytes
+from tito.compat import StringIO, ensure_binary
 from tito.tar import TarFixer
 
 from mock import Mock
@@ -12,8 +15,10 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
 class TarTest(unittest.TestCase):
     def setUp(self):
-        self.out = StringIO()
+        self.out = io.BytesIO()
         self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
+        self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
+        self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
         self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
         self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
         self.reference_hash = self.hash_file(self.reference_file)
@@ -65,7 +70,7 @@ class TarTest(unittest.TestCase):
         self.fh = open(self.test_file, 'rb')
         self.tarfixer.fh = self.fh
         self.tarfixer.fix()
-        self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
+        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
 
     def test_fix_fails_unless_file_in_binary_mode(self):
         self.fh = open(self.test_file, 'r')
@@ -96,8 +101,8 @@ class TarTest(unittest.TestCase):
         self.tarfixer.create_extended_header()
         header = self.out.getvalue()
         self.assertEqual(512, len(header))
-        self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
-        self.assertEqual("\x00" * (512 - 53), header[53:])
+        self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
+        self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
 
     def test_calculate_checksum(self):
         fields = {
@@ -119,5 +124,33 @@ class TarTest(unittest.TestCase):
         }
         result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
         expected_result = ["%07o\x00" % mode, "hello"]
-        expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
+        expected_result = list(map(lambda x: ensure_binary(x), expected_result))
         self.assertEqual(expected_result, result)
+
+    def test_utf8_file(self):
+        # The goal of this test is to *not* throw a UnicodeDecodeError
+        self.fh = open(self.utf8_file, 'rb')
+        self.tarfixer.fh = self.fh
+        self.tarfixer.fix()
+        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
+
+        # rewind the buffer
+        self.out.seek(0)
+        try:
+            tarball = tarfile.open(fileobj=self.out, mode="r")
+        except tarfile.TarError:
+            self.fail("Unable to open generated tarball")
+
+    def test_utf8_containing_file(self):
+        # The goal of this test is to *not* blow up due to a corrupted tarball
+        self.fh = open(self.utf8_containing_file, 'rb')
+        self.tarfixer.fh = self.fh
+        self.tarfixer.fix()
+
+        # rewind the buffer
+        self.out.seek(0)
+        try:
+            tarball = tarfile.open(fileobj=self.out, mode="r")
+        except tarfile.TarError as e:
+            self.fail("Unable to open generated tarball: %s" % e)