mirror of
https://github.com/rpm-software-management/tito.git
synced 2025-02-23 12:12:47 +00:00
Fix #335. Handle source tarballs with UTF8 characters in the name.
Following the principles of https://nedbatchelder.com/text/unipain.html our goal is to decode bytes into unicode as soon as we read them and encode unicode data to bytes at the last second. The specific problem we were seeing was caused by calling "encode" on a byte string rather than a unicode string. Python attempts to be "helpful" and tries to decode the bytes as ASCII in order to provide a unicode string to the encode function. Since the bytes aren't ASCII, the decode fails and we get the UnicodeDecodeError despite the fact that we never explicitly asked for a decode at all. Also, calculate checksums correctly for tarballs that have files with UTF8 characters in the file name.
This commit is contained in:
parent
77d54e6579
commit
03509b36d5
5 changed files with 74 additions and 33 deletions
|
@ -26,26 +26,34 @@ if PY2:
|
||||||
from ConfigParser import RawConfigParser
|
from ConfigParser import RawConfigParser
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import xmlrpclib
|
import xmlrpclib
|
||||||
|
text_type = unicode
|
||||||
|
binary_type = str
|
||||||
else:
|
else:
|
||||||
import subprocess
|
import subprocess
|
||||||
from configparser import NoOptionError
|
from configparser import NoOptionError
|
||||||
from configparser import RawConfigParser
|
from configparser import RawConfigParser
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import xmlrpc.client as xmlrpclib
|
import xmlrpc.client as xmlrpclib
|
||||||
|
text_type = str
|
||||||
|
binary_type = bytes
|
||||||
|
|
||||||
|
|
||||||
def decode_bytes(x, source_encoding):
|
def ensure_text(x, encoding="utf8"):
|
||||||
if PY2:
|
if isinstance(x, binary_type):
|
||||||
|
return x.decode(encoding)
|
||||||
|
elif isinstance(x, text_type):
|
||||||
return x
|
return x
|
||||||
else:
|
else:
|
||||||
return x.decode(source_encoding)
|
raise TypeError("Not expecting type '%s'" % type(x))
|
||||||
|
|
||||||
|
|
||||||
def encode_bytes(x, destination_encoding):
|
def ensure_binary(x, encoding="utf8"):
|
||||||
if PY2:
|
if isinstance(x, text_type):
|
||||||
|
return x.encode(encoding)
|
||||||
|
elif isinstance(x, binary_type):
|
||||||
return x
|
return x
|
||||||
else:
|
else:
|
||||||
return bytes(x, destination_encoding)
|
raise TypeError("Not expecting type '%s'" % type(x))
|
||||||
|
|
||||||
|
|
||||||
def getstatusoutput(cmd):
|
def getstatusoutput(cmd):
|
||||||
|
|
|
@ -14,8 +14,8 @@
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
import codecs
|
||||||
from tito.compat import decode_bytes, encode_bytes
|
import tito.compat
|
||||||
|
|
||||||
RECORD_SIZE = 512
|
RECORD_SIZE = 512
|
||||||
|
|
||||||
|
@ -120,7 +120,7 @@ class TarFixer(object):
|
||||||
def full_read(self, read_size):
|
def full_read(self, read_size):
|
||||||
read = self.fh.read(read_size)
|
read = self.fh.read(read_size)
|
||||||
amount_read = len(read)
|
amount_read = len(read)
|
||||||
while (amount_read < read_size):
|
while amount_read < read_size:
|
||||||
left_to_read = read_size - amount_read
|
left_to_read = read_size - amount_read
|
||||||
next_read = self.fh.read(left_to_read)
|
next_read = self.fh.read(left_to_read)
|
||||||
|
|
||||||
|
@ -133,13 +133,7 @@ class TarFixer(object):
|
||||||
return read
|
return read
|
||||||
|
|
||||||
def write(self, data):
|
def write(self, data):
|
||||||
"""Write the data correctly depending on the mode of the file. While binary mode
|
self.out.write(tito.compat.ensure_binary(data))
|
||||||
is preferred, we support text mode for streams like stdout."""
|
|
||||||
if hasattr(self.out, 'mode') and 'b' in self.out.mode:
|
|
||||||
data = bytearray(data)
|
|
||||||
else:
|
|
||||||
data = decode_bytes(data, "utf8")
|
|
||||||
self.out.write(data)
|
|
||||||
|
|
||||||
def chunk_to_hash(self, chunk):
|
def chunk_to_hash(self, chunk):
|
||||||
# Our struct template is only 500 bytes, but the last 12 bytes are NUL
|
# Our struct template is only 500 bytes, but the last 12 bytes are NUL
|
||||||
|
@ -147,7 +141,7 @@ class TarFixer(object):
|
||||||
# template as '12x'. The unpack_from method will read the bytes our
|
# template as '12x'. The unpack_from method will read the bytes our
|
||||||
# template defines from chunk and discard the rest.
|
# template defines from chunk and discard the rest.
|
||||||
unpacked = struct.unpack_from(self.struct_template, chunk)
|
unpacked = struct.unpack_from(self.struct_template, chunk)
|
||||||
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
|
unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
|
||||||
# Zip what we read together with the member names and create a dictionary
|
# Zip what we read together with the member names and create a dictionary
|
||||||
chunk_props = dict(zip(self.struct_members, unpacked))
|
chunk_props = dict(zip(self.struct_members, unpacked))
|
||||||
|
|
||||||
|
@ -193,9 +187,9 @@ class TarFixer(object):
|
||||||
field_size = int(re.match('(\d+)', member_template).group(1)) - 1
|
field_size = int(re.match('(\d+)', member_template).group(1)) - 1
|
||||||
fmt = "%0" + str(field_size) + "o\x00"
|
fmt = "%0" + str(field_size) + "o\x00"
|
||||||
as_string = fmt % chunk_props[member]
|
as_string = fmt % chunk_props[member]
|
||||||
pack_values.append(as_string.encode("utf8"))
|
pack_values.append(tito.compat.ensure_binary(as_string))
|
||||||
else:
|
else:
|
||||||
pack_values.append(chunk_props[member].encode("utf8"))
|
pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
|
||||||
return pack_values
|
return pack_values
|
||||||
|
|
||||||
def process_header(self, chunk_props):
|
def process_header(self, chunk_props):
|
||||||
|
@ -218,10 +212,10 @@ class TarFixer(object):
|
||||||
# the size of the whole string (including the %u), the first %s is the
|
# the size of the whole string (including the %u), the first %s is the
|
||||||
# keyword, the second one is the value.
|
# keyword, the second one is the value.
|
||||||
#
|
#
|
||||||
# Since the git ref is always 40 characters we can
|
# Since the git ref is always 40 ASCII characters we can pre-compute the length
|
||||||
# pre-compute the length to put in the extended header
|
# to put in the extended header
|
||||||
comment = "52 comment=%s\n" % self.gitref
|
comment = "52 comment=%s\n" % self.gitref
|
||||||
data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
|
data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
|
||||||
self.write(data_out)
|
self.write(data_out)
|
||||||
self.total_length += len(data_out)
|
self.total_length += len(data_out)
|
||||||
|
|
||||||
|
@ -241,9 +235,9 @@ class TarFixer(object):
|
||||||
values = self.encode_header(chunk_props)
|
values = self.encode_header(chunk_props)
|
||||||
new_chksum = 0
|
new_chksum = 0
|
||||||
for val in values:
|
for val in values:
|
||||||
val_bytes = val.decode("utf8")
|
val_bytes = bytearray(tito.compat.ensure_binary(val))
|
||||||
for b in val_bytes:
|
for b in val_bytes:
|
||||||
new_chksum += ord(b)
|
new_chksum += b
|
||||||
return "%07o\x00" % new_chksum
|
return "%07o\x00" % new_chksum
|
||||||
|
|
||||||
def process_chunk(self, chunk):
|
def process_chunk(self, chunk):
|
||||||
|
@ -336,8 +330,8 @@ class TarFixer(object):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 5:
|
||||||
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])
|
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
timestamp = int(sys.argv[1])
|
timestamp = int(sys.argv[1])
|
||||||
|
@ -346,11 +340,17 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
gitref = sys.argv[2]
|
gitref = sys.argv[2]
|
||||||
tar_file = sys.argv[3]
|
tar_file = sys.argv[3]
|
||||||
|
destination_file = sys.argv[4]
|
||||||
|
|
||||||
|
try:
|
||||||
|
dfh = open(destination_file, 'wb')
|
||||||
|
except:
|
||||||
|
print("Could not open %s" % destination_file)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fh = open(tar_file, 'rb')
|
fh = open(tar_file, 'rb')
|
||||||
except:
|
except:
|
||||||
print("Could not read %s" % tar_file)
|
print("Could not read %s" % tar_file)
|
||||||
|
|
||||||
reader = TarFixer(fh, sys.stdout, timestamp, gitref)
|
reader = TarFixer(fh, dfh, timestamp, gitref)
|
||||||
reader.fix()
|
reader.fix()
|
||||||
|
|
BIN
test/unit/resources/archivé.tar
Normal file
BIN
test/unit/resources/archivé.tar
Normal file
Binary file not shown.
BIN
test/unit/resources/les_misérables.tar
Normal file
BIN
test/unit/resources/les_misérables.tar
Normal file
Binary file not shown.
|
@ -1,8 +1,11 @@
|
||||||
|
# coding=utf-8
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
import tarfile
|
||||||
import unittest
|
import unittest
|
||||||
|
import io
|
||||||
|
|
||||||
from tito.compat import StringIO, encode_bytes
|
from tito.compat import StringIO, ensure_binary
|
||||||
from tito.tar import TarFixer
|
from tito.tar import TarFixer
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
|
|
||||||
|
@ -12,8 +15,10 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
|
||||||
|
|
||||||
class TarTest(unittest.TestCase):
|
class TarTest(unittest.TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.out = StringIO()
|
self.out = io.BytesIO()
|
||||||
self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
|
self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
|
||||||
|
self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
|
||||||
|
self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
|
||||||
self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
|
self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
|
||||||
self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
|
self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
|
||||||
self.reference_hash = self.hash_file(self.reference_file)
|
self.reference_hash = self.hash_file(self.reference_file)
|
||||||
|
@ -65,7 +70,7 @@ class TarTest(unittest.TestCase):
|
||||||
self.fh = open(self.test_file, 'rb')
|
self.fh = open(self.test_file, 'rb')
|
||||||
self.tarfixer.fh = self.fh
|
self.tarfixer.fh = self.fh
|
||||||
self.tarfixer.fix()
|
self.tarfixer.fix()
|
||||||
self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
|
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
|
||||||
|
|
||||||
def test_fix_fails_unless_file_in_binary_mode(self):
|
def test_fix_fails_unless_file_in_binary_mode(self):
|
||||||
self.fh = open(self.test_file, 'r')
|
self.fh = open(self.test_file, 'r')
|
||||||
|
@ -96,8 +101,8 @@ class TarTest(unittest.TestCase):
|
||||||
self.tarfixer.create_extended_header()
|
self.tarfixer.create_extended_header()
|
||||||
header = self.out.getvalue()
|
header = self.out.getvalue()
|
||||||
self.assertEqual(512, len(header))
|
self.assertEqual(512, len(header))
|
||||||
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
|
self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
|
||||||
self.assertEqual("\x00" * (512 - 53), header[53:])
|
self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
|
||||||
|
|
||||||
def test_calculate_checksum(self):
|
def test_calculate_checksum(self):
|
||||||
fields = {
|
fields = {
|
||||||
|
@ -119,5 +124,33 @@ class TarTest(unittest.TestCase):
|
||||||
}
|
}
|
||||||
result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
|
result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
|
||||||
expected_result = ["%07o\x00" % mode, "hello"]
|
expected_result = ["%07o\x00" % mode, "hello"]
|
||||||
expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
|
expected_result = list(map(lambda x: ensure_binary(x), expected_result))
|
||||||
self.assertEqual(expected_result, result)
|
self.assertEqual(expected_result, result)
|
||||||
|
|
||||||
|
def test_utf8_file(self):
|
||||||
|
# The goal of this test is to *not* throw a UnicodeDecodeError
|
||||||
|
self.fh = open(self.utf8_file, 'rb')
|
||||||
|
self.tarfixer.fh = self.fh
|
||||||
|
self.tarfixer.fix()
|
||||||
|
|
||||||
|
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
|
||||||
|
|
||||||
|
# rewind the buffer
|
||||||
|
self.out.seek(0)
|
||||||
|
try:
|
||||||
|
tarball = tarfile.open(fileobj=self.out, mode="r")
|
||||||
|
except tarfile.TarError:
|
||||||
|
self.fail("Unable to open generated tarball")
|
||||||
|
|
||||||
|
def test_utf8_containing_file(self):
|
||||||
|
# # The goal of this test is to *not* blow up due to a corrupted tarball
|
||||||
|
self.fh = open(self.utf8_containing_file, 'rb')
|
||||||
|
self.tarfixer.fh = self.fh
|
||||||
|
self.tarfixer.fix()
|
||||||
|
|
||||||
|
# rewind the buffer
|
||||||
|
self.out.seek(0)
|
||||||
|
try:
|
||||||
|
tarball = tarfile.open(fileobj=self.out, mode="r")
|
||||||
|
except tarfile.TarError as e:
|
||||||
|
self.fail("Unable to open generated tarball: %s" % e)
|
||||||
|
|
Loading…
Add table
Reference in a new issue