Merge pull request #336 from awood/master

Fix UnicodeDecodeError caused by source tarballs with utf8 names.
This commit is contained in:
Devan Goodwin 2019-04-10 15:51:39 -03:00 committed by GitHub
commit 86dc621bf4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 78 additions and 37 deletions

1
requirements.txt Normal file
View file

@ -0,0 +1 @@
blessings

View file

@ -33,6 +33,9 @@ setup(
}, },
packages=find_packages('src'), packages=find_packages('src'),
include_package_data=True, include_package_data=True,
install_requires=[
'blessings'
],
# non-python scripts go here # non-python scripts go here
scripts=[ scripts=[

View file

@ -26,26 +26,34 @@ if PY2:
from ConfigParser import RawConfigParser from ConfigParser import RawConfigParser
from StringIO import StringIO from StringIO import StringIO
import xmlrpclib import xmlrpclib
text_type = unicode
binary_type = str
else: else:
import subprocess import subprocess
from configparser import NoOptionError from configparser import NoOptionError
from configparser import RawConfigParser from configparser import RawConfigParser
from io import StringIO from io import StringIO
import xmlrpc.client as xmlrpclib import xmlrpc.client as xmlrpclib
text_type = str
binary_type = bytes
def decode_bytes(x, source_encoding): def ensure_text(x, encoding="utf8"):
if PY2: if isinstance(x, binary_type):
return x.decode(encoding)
elif isinstance(x, text_type):
return x return x
else: else:
return x.decode(source_encoding) raise TypeError("Not expecting type '%s'" % type(x))
def encode_bytes(x, destination_encoding): def ensure_binary(x, encoding="utf8"):
if PY2: if isinstance(x, text_type):
return x.encode(encoding)
elif isinstance(x, binary_type):
return x return x
else: else:
return bytes(x, destination_encoding) raise TypeError("Not expecting type '%s'" % type(x))
def getstatusoutput(cmd): def getstatusoutput(cmd):

View file

@ -14,8 +14,8 @@
import re import re
import struct import struct
import sys import sys
import codecs
from tito.compat import decode_bytes, encode_bytes import tito.compat
RECORD_SIZE = 512 RECORD_SIZE = 512
@ -120,7 +120,7 @@ class TarFixer(object):
def full_read(self, read_size): def full_read(self, read_size):
read = self.fh.read(read_size) read = self.fh.read(read_size)
amount_read = len(read) amount_read = len(read)
while (amount_read < read_size): while amount_read < read_size:
left_to_read = read_size - amount_read left_to_read = read_size - amount_read
next_read = self.fh.read(left_to_read) next_read = self.fh.read(left_to_read)
@ -133,13 +133,7 @@ class TarFixer(object):
return read return read
def write(self, data): def write(self, data):
"""Write the data correctly depending on the mode of the file. While binary mode self.out.write(tito.compat.ensure_binary(data))
is preferred, we support text mode for streams like stdout."""
if hasattr(self.out, 'mode') and 'b' in self.out.mode:
data = bytearray(data)
else:
data = decode_bytes(data, "utf8")
self.out.write(data)
def chunk_to_hash(self, chunk): def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL # Our struct template is only 500 bytes, but the last 12 bytes are NUL
@ -147,7 +141,7 @@ class TarFixer(object):
# template as '12x'. The unpack_from method will read the bytes our # template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest. # template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk) unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked)) unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
# Zip what we read together with the member names and create a dictionary # Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked)) chunk_props = dict(zip(self.struct_members, unpacked))
@ -193,9 +187,9 @@ class TarFixer(object):
field_size = int(re.match('(\d+)', member_template).group(1)) - 1 field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00" fmt = "%0" + str(field_size) + "o\x00"
as_string = fmt % chunk_props[member] as_string = fmt % chunk_props[member]
pack_values.append(as_string.encode("utf8")) pack_values.append(tito.compat.ensure_binary(as_string))
else: else:
pack_values.append(chunk_props[member].encode("utf8")) pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
return pack_values return pack_values
def process_header(self, chunk_props): def process_header(self, chunk_props):
@ -218,10 +212,10 @@ class TarFixer(object):
# the size of the whole string (including the %u), the first %s is the # the size of the whole string (including the %u), the first %s is the
# keyword, the second one is the value. # keyword, the second one is the value.
# #
# Since the git ref is always 40 characters we can # Since the git ref is always 40 ASCII characters we can pre-compute the length
# pre-compute the length to put in the extended header # to put in the extended header
comment = "52 comment=%s\n" % self.gitref comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii")) data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
self.write(data_out) self.write(data_out)
self.total_length += len(data_out) self.total_length += len(data_out)
@ -241,9 +235,9 @@ class TarFixer(object):
values = self.encode_header(chunk_props) values = self.encode_header(chunk_props)
new_chksum = 0 new_chksum = 0
for val in values: for val in values:
val_bytes = val.decode("utf8") val_bytes = bytearray(tito.compat.ensure_binary(val))
for b in val_bytes: for b in val_bytes:
new_chksum += ord(b) new_chksum += b
return "%07o\x00" % new_chksum return "%07o\x00" % new_chksum
def process_chunk(self, chunk): def process_chunk(self, chunk):
@ -336,8 +330,8 @@ class TarFixer(object):
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 4: if len(sys.argv) != 5:
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0]) sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
try: try:
timestamp = int(sys.argv[1]) timestamp = int(sys.argv[1])
@ -346,11 +340,17 @@ if __name__ == '__main__':
gitref = sys.argv[2] gitref = sys.argv[2]
tar_file = sys.argv[3] tar_file = sys.argv[3]
destination_file = sys.argv[4]
try:
dfh = open(destination_file, 'wb')
except:
print("Could not open %s" % destination_file)
try: try:
fh = open(tar_file, 'rb') fh = open(tar_file, 'rb')
except: except:
print("Could not read %s" % tar_file) print("Could not read %s" % tar_file)
reader = TarFixer(fh, sys.stdout, timestamp, gitref) reader = TarFixer(fh, dfh, timestamp, gitref)
reader.fix() reader.fix()

View file

@ -5,10 +5,8 @@ Version: 0.0.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: tito test package for the external source builder Summary: tito test package for the external source builder
URL: https://example.com URL: https://example.com
Group: Applications/Internet
License: GPLv2 License: GPLv2
Source0: %{name}-%{version}.tar.gz Source0: %{name}-%{version}.tar.gz
BuildRoot: %{_tmppath}/%{name}-root-%(%{__id_u} -n)
BuildArch: noarch BuildArch: noarch
%description %description
@ -19,10 +17,8 @@ Nobody cares.
%build %build
%install %install
rm -rf %{buildroot}
%clean %clean
rm -rf %{buildroot}
%files %files
%defattr(-,root,root) %defattr(-,root,root)

Binary file not shown.

Binary file not shown.

View file

@ -1,8 +1,11 @@
# coding=utf-8
import hashlib import hashlib
import os import os
import tarfile
import unittest import unittest
import io
from tito.compat import StringIO, encode_bytes from tito.compat import StringIO, ensure_binary
from tito.tar import TarFixer from tito.tar import TarFixer
from mock import Mock from mock import Mock
@ -12,8 +15,10 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
class TarTest(unittest.TestCase): class TarTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.out = StringIO() self.out = io.BytesIO()
self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF) self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar') self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar') self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
self.reference_hash = self.hash_file(self.reference_file) self.reference_hash = self.hash_file(self.reference_file)
@ -65,7 +70,7 @@ class TarTest(unittest.TestCase):
self.fh = open(self.test_file, 'rb') self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh self.tarfixer.fh = self.fh
self.tarfixer.fix() self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8"))) self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
def test_fix_fails_unless_file_in_binary_mode(self): def test_fix_fails_unless_file_in_binary_mode(self):
self.fh = open(self.test_file, 'r') self.fh = open(self.test_file, 'r')
@ -96,8 +101,8 @@ class TarTest(unittest.TestCase):
self.tarfixer.create_extended_header() self.tarfixer.create_extended_header()
header = self.out.getvalue() header = self.out.getvalue()
self.assertEqual(512, len(header)) self.assertEqual(512, len(header))
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52]) self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:]) self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
def test_calculate_checksum(self): def test_calculate_checksum(self):
fields = { fields = {
@ -119,5 +124,33 @@ class TarTest(unittest.TestCase):
} }
result = self.tarfixer.encode_header(chunk, ['mode', 'name']) result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, "hello"] expected_result = ["%07o\x00" % mode, "hello"]
expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result)) expected_result = list(map(lambda x: ensure_binary(x), expected_result))
self.assertEqual(expected_result, result) self.assertEqual(expected_result, result)
def test_utf8_file(self):
# The goal of this test is to *not* throw a UnicodeDecodeError
self.fh = open(self.utf8_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError:
self.fail("Unable to open generated tarball")
def test_utf8_containing_file(self):
# # The goal of this test is to *not* blow up due to a corrupted tarball
self.fh = open(self.utf8_containing_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError as e:
self.fail("Unable to open generated tarball: %s" % e)