Revert "Fix #335. Handle source tarballs with UTF8 characters in the name."

This reverts commit 03509b36d5.
2025-02-23 12:12:47 +00:00 · 2019-06-05 11:04:42 -04:00 · 2019-06-05 11:04:42 -04:00 · c2c4c5308f
commit c2c4c5308f
parent d6c7824177
5 changed files with 33 additions and 74 deletions
--- a/src/tito/compat.py
+++ b/src/tito/compat.py
@@ -26,34 +26,26 @@ if PY2:
    from ConfigParser import RawConfigParser
    from StringIO import StringIO
    import xmlrpclib
-    text_type = unicode
-    binary_type = str
 else:
    import subprocess
    from configparser import NoOptionError
    from configparser import RawConfigParser
    from io import StringIO
    import xmlrpc.client as xmlrpclib
-    text_type = str
-    binary_type = bytes


-def ensure_text(x, encoding="utf8"):
-    if isinstance(x, binary_type):
-        return x.decode(encoding)
-    elif isinstance(x, text_type):
+def decode_bytes(x, source_encoding):
+    if PY2:
        return x
    else:
-        raise TypeError("Not expecting type '%s'" % type(x))
+        return x.decode(source_encoding)


-def ensure_binary(x, encoding="utf8"):
-    if isinstance(x, text_type):
-        return x.encode(encoding)
-    elif isinstance(x, binary_type):
+def encode_bytes(x, destination_encoding):
+    if PY2:
        return x
    else:
-        raise TypeError("Not expecting type '%s'" % type(x))
+        return bytes(x, destination_encoding)


 def getstatusoutput(cmd):
--- a/src/tito/tar.py
+++ b/src/tito/tar.py
@@ -14,8 +14,8 @@
 import re
 import struct
 import sys
-import codecs
-import tito.compat
+
+from tito.compat import decode_bytes, encode_bytes

 RECORD_SIZE = 512

@@ -120,7 +120,7 @@ class TarFixer(object):
    def full_read(self, read_size):
        read = self.fh.read(read_size)
        amount_read = len(read)
-        while amount_read < read_size:
+        while (amount_read < read_size):
            left_to_read = read_size - amount_read
            next_read = self.fh.read(left_to_read)

@@ -133,7 +133,13 @@ class TarFixer(object):
        return read

    def write(self, data):
-        self.out.write(tito.compat.ensure_binary(data))
+        """Write the data correctly depending on the mode of the file.  While binary mode
+        is preferred, we support text mode for streams like stdout."""
+        if hasattr(self.out, 'mode') and 'b' in self.out.mode:
+            data = bytearray(data)
+        else:
+            data = decode_bytes(data, "utf8")
+        self.out.write(data)

    def chunk_to_hash(self, chunk):
        # Our struct template is only 500 bytes, but the last 12 bytes are NUL
@@ -141,7 +147,7 @@ class TarFixer(object):
        # template as '12x'.  The unpack_from method will read the bytes our
        # template defines from chunk and discard the rest.
        unpacked = struct.unpack_from(self.struct_template, chunk)
-        unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
+        unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
        # Zip what we read together with the member names and create a dictionary
        chunk_props = dict(zip(self.struct_members, unpacked))

@@ -187,9 +193,9 @@ class TarFixer(object):
                field_size = int(re.match('(\d+)', member_template).group(1)) - 1
                fmt = "%0" + str(field_size) + "o\x00"
                as_string = fmt % chunk_props[member]
-                pack_values.append(tito.compat.ensure_binary(as_string))
+                pack_values.append(as_string.encode("utf8"))
            else:
-                pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
+                pack_values.append(chunk_props[member].encode("utf8"))
        return pack_values

    def process_header(self, chunk_props):
@@ -212,10 +218,10 @@ class TarFixer(object):
        # the size of the whole string (including the %u), the first %s is the
        # keyword, the second one is the value.
        #
-        # Since the git ref is always 40 ASCII characters we can pre-compute the length
-        # to put in the extended header
+        # Since the git ref is always 40 characters we can
+        # pre-compute the length to put in the extended header
        comment = "52 comment=%s\n" % self.gitref
-        data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
+        data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
        self.write(data_out)
        self.total_length += len(data_out)

@@ -235,9 +241,9 @@ class TarFixer(object):
        values = self.encode_header(chunk_props)
        new_chksum = 0
        for val in values:
-            val_bytes = bytearray(tito.compat.ensure_binary(val))
+            val_bytes = val.decode("utf8")
            for b in val_bytes:
-                new_chksum += b
+                new_chksum += ord(b)
        return "%07o\x00" % new_chksum

    def process_chunk(self, chunk):
@@ -330,8 +336,8 @@ class TarFixer(object):


 if __name__ == '__main__':
-    if len(sys.argv) != 5:
-        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
+    if len(sys.argv) != 4:
+        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])

    try:
        timestamp = int(sys.argv[1])
@@ -340,17 +346,11 @@ if __name__ == '__main__':

    gitref = sys.argv[2]
    tar_file = sys.argv[3]
-    destination_file = sys.argv[4]
-
-    try:
-        dfh = open(destination_file, 'wb')
-    except:
-        print("Could not open %s" % destination_file)

    try:
        fh = open(tar_file, 'rb')
    except:
        print("Could not read %s" % tar_file)

-    reader = TarFixer(fh, dfh, timestamp, gitref)
+    reader = TarFixer(fh, sys.stdout, timestamp, gitref)
    reader.fix()
--- a/test/unit/resources/archivé.tar
+++ b/test/unit/resources/archivé.tar
--- a/test/unit/resources/les_misérables.tar
+++ b/test/unit/resources/les_misérables.tar
--- a/test/unit/test_tar.py
+++ b/test/unit/test_tar.py
@@ -1,11 +1,8 @@
-# coding=utf-8
 import hashlib
 import os
-import tarfile
 import unittest
-import io

-from tito.compat import StringIO, ensure_binary
+from tito.compat import StringIO, encode_bytes
 from tito.tar import TarFixer
 from mock import Mock

@@ -15,10 +12,8 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"

 class TarTest(unittest.TestCase):
    def setUp(self):
-        self.out = io.BytesIO()
+        self.out = StringIO()
        self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
-        self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
-        self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
        self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
        self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
        self.reference_hash = self.hash_file(self.reference_file)
@@ -70,7 +65,7 @@ class TarTest(unittest.TestCase):
        self.fh = open(self.test_file, 'rb')
        self.tarfixer.fh = self.fh
        self.tarfixer.fix()
-        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
+        self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))

    def test_fix_fails_unless_file_in_binary_mode(self):
        self.fh = open(self.test_file, 'r')
@@ -101,8 +96,8 @@ class TarTest(unittest.TestCase):
        self.tarfixer.create_extended_header()
        header = self.out.getvalue()
        self.assertEqual(512, len(header))
-        self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
-        self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
+        self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
+        self.assertEqual("\x00" * (512 - 53), header[53:])

    def test_calculate_checksum(self):
        fields = {
@@ -124,33 +119,5 @@ class TarTest(unittest.TestCase):
        }
        result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
        expected_result = ["%07o\x00" % mode, "hello"]
-        expected_result = list(map(lambda x: ensure_binary(x), expected_result))
+        expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
        self.assertEqual(expected_result, result)
-
-    def test_utf8_file(self):
-        # The goal of this test is to *not* throw a UnicodeDecodeError
-        self.fh = open(self.utf8_file, 'rb')
-        self.tarfixer.fh = self.fh
-        self.tarfixer.fix()
-
-        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
-
-        # rewind the buffer
-        self.out.seek(0)
-        try:
-            tarball = tarfile.open(fileobj=self.out, mode="r")
-        except tarfile.TarError:
-            self.fail("Unable to open generated tarball")
-
-    def test_utf8_containing_file(self):
-        # # The goal of this test is to *not* blow up due to a corrupted tarball
-        self.fh = open(self.utf8_containing_file, 'rb')
-        self.tarfixer.fh = self.fh
-        self.tarfixer.fix()
-
-        # rewind the buffer
-        self.out.seek(0)
-        try:
-            tarball = tarfile.open(fileobj=self.out, mode="r")
-        except tarfile.TarError as e:
-            self.fail("Unable to open generated tarball: %s" % e)