Fix #335. Handle source tarballs with UTF8 characters in the name.

Following the principles of https://nedbatchelder.com/text/unipain.html,
our goal is to decode bytes into unicode as soon as we read them and
encode unicode data back to bytes at the last second.

The specific problem we were seeing was caused by calling "encode" on a
byte string rather than a unicode string.  Python 2 tries to be
"helpful" and implicitly decodes the bytes as ASCII so that it can hand
a unicode string to the encode call.  Since the bytes aren't ASCII, that
implicit decode fails and we get a UnicodeDecodeError even though we
never explicitly asked for a decode at all.
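
To make the failure mode concrete, this is roughly what it looks like under Python 2 (illustrative only, not part of the commit; the filename bytes are made up):

    # Python 2 session, for illustration
    name = 'archiv\xc3\xa9'      # a byte string holding UTF-8 encoded data
    name.encode('utf8')          # Python 2 first runs name.decode('ascii'), which raises:
    # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6

    # Decoding to unicode on input and encoding back to bytes on output round-trips cleanly:
    name.decode('utf8').encode('utf8')   # == 'archiv\xc3\xa9'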

Also, calculate checksums correctly for tarballs that have files with
UTF8 characters in the file name.
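
A rough sketch of the checksum issue (illustrative, not taken from the commit): a tar header checksum is a sum over the raw header bytes, so summing ord() over a decoded unicode string counts a multi-byte UTF-8 character once instead of once per byte.

    # Illustrative comparison for a name containing a multi-byte character
    name = u'archiv\xe9'                            # u'archivé'
    per_char = sum(ord(c) for c in name)            # counts 'é' as a single value
    per_byte = sum(bytearray(name.encode('utf8')))  # counts each of the two UTF-8 bytes of 'é'
    assert per_char != per_byte                     # the per-byte sum is what the tar format expects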
Alex Wood, 2019-04-09 14:30:59 -04:00
parent 77d54e6579
commit 03509b36d5
5 changed files with 74 additions and 33 deletions


@@ -26,26 +26,34 @@ if PY2:
     from ConfigParser import RawConfigParser
     from StringIO import StringIO
     import xmlrpclib
+    text_type = unicode
+    binary_type = str
 else:
     import subprocess
     from configparser import NoOptionError
     from configparser import RawConfigParser
     from io import StringIO
     import xmlrpc.client as xmlrpclib
+    text_type = str
+    binary_type = bytes
 
-def decode_bytes(x, source_encoding):
-    if PY2:
-        return x
-    else:
-        return x.decode(source_encoding)
+def ensure_text(x, encoding="utf8"):
+    if isinstance(x, binary_type):
+        return x.decode(encoding)
+    elif isinstance(x, text_type):
+        return x
+    else:
+        raise TypeError("Not expecting type '%s'" % type(x))
 
-def encode_bytes(x, destination_encoding):
-    if PY2:
-        return x
-    else:
-        return bytes(x, destination_encoding)
+def ensure_binary(x, encoding="utf8"):
+    if isinstance(x, text_type):
+        return x.encode(encoding)
+    elif isinstance(x, binary_type):
+        return x
+    else:
+        raise TypeError("Not expecting type '%s'" % type(x))
 
 def getstatusoutput(cmd):
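
In short, the new helpers behave much like six's ensure_text/ensure_binary: a value already of the target type passes through, the other type is converted (UTF-8 by default), and anything else raises TypeError. A quick illustrative sketch of the intended use:

    from tito.compat import ensure_binary, ensure_text

    ensure_text(b'archiv\xc3\xa9')    # -> u'archivé', decoded as UTF-8
    ensure_text(u'archivé')           # -> u'archivé', already text, passed through
    ensure_binary(u'archivé')         # -> b'archiv\xc3\xa9', encoded as UTF-8
    ensure_binary(b'archiv\xc3\xa9')  # -> returned unchanged
    ensure_text(42)                   # raises TypeError: neither text nor bytes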


@@ -14,8 +14,8 @@
 import re
 import struct
 import sys
-import codecs
 
-from tito.compat import decode_bytes, encode_bytes
+import tito.compat
 
 RECORD_SIZE = 512
@@ -120,7 +120,7 @@ class TarFixer(object):
     def full_read(self, read_size):
         read = self.fh.read(read_size)
         amount_read = len(read)
-        while (amount_read < read_size):
+        while amount_read < read_size:
             left_to_read = read_size - amount_read
             next_read = self.fh.read(left_to_read)
@@ -133,13 +133,7 @@ class TarFixer(object):
         return read
 
     def write(self, data):
-        """Write the data correctly depending on the mode of the file. While binary mode
-        is preferred, we support text mode for streams like stdout."""
-        if hasattr(self.out, 'mode') and 'b' in self.out.mode:
-            data = bytearray(data)
-        else:
-            data = decode_bytes(data, "utf8")
-        self.out.write(data)
+        self.out.write(tito.compat.ensure_binary(data))
 
     def chunk_to_hash(self, chunk):
         # Our struct template is only 500 bytes, but the last 12 bytes are NUL
@@ -147,7 +141,7 @@ class TarFixer(object):
         # template as '12x'. The unpack_from method will read the bytes our
         # template defines from chunk and discard the rest.
         unpacked = struct.unpack_from(self.struct_template, chunk)
-        unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
+        unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
 
         # Zip what we read together with the member names and create a dictionary
         chunk_props = dict(zip(self.struct_members, unpacked))
@@ -193,9 +187,9 @@ class TarFixer(object):
                 field_size = int(re.match('(\d+)', member_template).group(1)) - 1
                 fmt = "%0" + str(field_size) + "o\x00"
                 as_string = fmt % chunk_props[member]
-                pack_values.append(as_string.encode("utf8"))
+                pack_values.append(tito.compat.ensure_binary(as_string))
             else:
-                pack_values.append(chunk_props[member].encode("utf8"))
+                pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
         return pack_values
 
     def process_header(self, chunk_props):
@@ -218,10 +212,10 @@ class TarFixer(object):
         # the size of the whole string (including the %u), the first %s is the
         # keyword, the second one is the value.
         #
-        # Since the git ref is always 40 characters we can
-        # pre-compute the length to put in the extended header
+        # Since the git ref is always 40 ASCII characters we can pre-compute the length
+        # to put in the extended header
         comment = "52 comment=%s\n" % self.gitref
-        data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
+        data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
         self.write(data_out)
         self.total_length += len(data_out)
@@ -241,9 +235,9 @@ class TarFixer(object):
         values = self.encode_header(chunk_props)
         new_chksum = 0
         for val in values:
-            val_bytes = val.decode("utf8")
+            val_bytes = bytearray(tito.compat.ensure_binary(val))
             for b in val_bytes:
-                new_chksum += ord(b)
+                new_chksum += b
         return "%07o\x00" % new_chksum
 
     def process_chunk(self, chunk):
@@ -336,8 +330,8 @@ class TarFixer(object):
 if __name__ == '__main__':
-    if len(sys.argv) != 4:
-        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])
+    if len(sys.argv) != 5:
+        sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
 
     try:
         timestamp = int(sys.argv[1])
@@ -346,11 +340,17 @@ if __name__ == '__main__':
     gitref = sys.argv[2]
     tar_file = sys.argv[3]
+    destination_file = sys.argv[4]
+
+    try:
+        dfh = open(destination_file, 'wb')
+    except:
+        print("Could not open %s" % destination_file)
 
     try:
         fh = open(tar_file, 'rb')
     except:
         print("Could not read %s" % tar_file)
 
-    reader = TarFixer(fh, sys.stdout, timestamp, gitref)
+    reader = TarFixer(fh, dfh, timestamp, gitref)
     reader.fix()
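
For reference, the fixer can also be driven directly; a minimal illustrative sketch mirroring the new __main__ flow (the file names, timestamp, and ref below are placeholders), writing to a destination file opened in binary mode rather than to stdout:

    from tito.tar import TarFixer

    with open('archive.tar', 'rb') as fh, open('archive-fixed.tar', 'wb') as dfh:
        TarFixer(fh, dfh, 1234567890, 'deadbeef' * 5).fix()  # placeholder timestamp and 40-char ref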

Binary file not shown.

Binary file not shown.


@@ -1,8 +1,11 @@
+# coding=utf-8
 import hashlib
 import os
+import tarfile
 import unittest
+import io
 
-from tito.compat import StringIO, encode_bytes
+from tito.compat import StringIO, ensure_binary
 from tito.tar import TarFixer
 
 from mock import Mock
@@ -12,8 +15,10 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
 class TarTest(unittest.TestCase):
     def setUp(self):
-        self.out = StringIO()
+        self.out = io.BytesIO()
         self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
+        self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
+        self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
         self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
         self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
         self.reference_hash = self.hash_file(self.reference_file)
@@ -65,7 +70,7 @@ class TarTest(unittest.TestCase):
         self.fh = open(self.test_file, 'rb')
         self.tarfixer.fh = self.fh
         self.tarfixer.fix()
-        self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
+        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
 
     def test_fix_fails_unless_file_in_binary_mode(self):
         self.fh = open(self.test_file, 'r')
@@ -96,8 +101,8 @@ class TarTest(unittest.TestCase):
         self.tarfixer.create_extended_header()
         header = self.out.getvalue()
         self.assertEqual(512, len(header))
-        self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
-        self.assertEqual("\x00" * (512 - 53), header[53:])
+        self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
+        self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
 
     def test_calculate_checksum(self):
         fields = {
@@ -119,5 +124,33 @@ class TarTest(unittest.TestCase):
         }
         result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
         expected_result = ["%07o\x00" % mode, "hello"]
-        expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
+        expected_result = list(map(lambda x: ensure_binary(x), expected_result))
         self.assertEqual(expected_result, result)
+
+    def test_utf8_file(self):
+        # The goal of this test is to *not* throw a UnicodeDecodeError
+        self.fh = open(self.utf8_file, 'rb')
+        self.tarfixer.fh = self.fh
+        self.tarfixer.fix()
+        self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
+
+        # rewind the buffer
+        self.out.seek(0)
+        try:
+            tarball = tarfile.open(fileobj=self.out, mode="r")
+        except tarfile.TarError:
+            self.fail("Unable to open generated tarball")
+
+    def test_utf8_containing_file(self):
+        # The goal of this test is to *not* blow up due to a corrupted tarball
+        self.fh = open(self.utf8_containing_file, 'rb')
+        self.tarfixer.fh = self.fh
+        self.tarfixer.fix()
+
+        # rewind the buffer
+        self.out.seek(0)
+        try:
+            tarball = tarfile.open(fileobj=self.out, mode="r")
+        except tarfile.TarError as e:
+            self.fail("Unable to open generated tarball: %s" % e)