Merge pull request #336 from awood/master

Fix UnicodeDecodeError caused by source tarballs with utf8 names.
This commit is contained in:
Devan Goodwin 2019-04-10 15:51:39 -03:00 committed by GitHub
commit 86dc621bf4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 78 additions and 37 deletions

1
requirements.txt Normal file
View file

@ -0,0 +1 @@
blessings

View file

@ -33,6 +33,9 @@ setup(
},
packages=find_packages('src'),
include_package_data=True,
install_requires=[
'blessings'
],
# non-python scripts go here
scripts=[

View file

@ -26,26 +26,34 @@ if PY2:
from ConfigParser import RawConfigParser
from StringIO import StringIO
import xmlrpclib
text_type = unicode
binary_type = str
else:
import subprocess
from configparser import NoOptionError
from configparser import RawConfigParser
from io import StringIO
import xmlrpc.client as xmlrpclib
text_type = str
binary_type = bytes
def decode_bytes(x, source_encoding):
if PY2:
def ensure_text(x, encoding="utf8"):
if isinstance(x, binary_type):
return x.decode(encoding)
elif isinstance(x, text_type):
return x
else:
return x.decode(source_encoding)
raise TypeError("Not expecting type '%s'" % type(x))
def encode_bytes(x, destination_encoding):
if PY2:
def ensure_binary(x, encoding="utf8"):
if isinstance(x, text_type):
return x.encode(encoding)
elif isinstance(x, binary_type):
return x
else:
return bytes(x, destination_encoding)
raise TypeError("Not expecting type '%s'" % type(x))
def getstatusoutput(cmd):

View file

@ -14,8 +14,8 @@
import re
import struct
import sys
from tito.compat import decode_bytes, encode_bytes
import codecs
import tito.compat
RECORD_SIZE = 512
@ -120,7 +120,7 @@ class TarFixer(object):
def full_read(self, read_size):
read = self.fh.read(read_size)
amount_read = len(read)
while (amount_read < read_size):
while amount_read < read_size:
left_to_read = read_size - amount_read
next_read = self.fh.read(left_to_read)
@ -133,13 +133,7 @@ class TarFixer(object):
return read
def write(self, data):
"""Write the data correctly depending on the mode of the file. While binary mode
is preferred, we support text mode for streams like stdout."""
if hasattr(self.out, 'mode') and 'b' in self.out.mode:
data = bytearray(data)
else:
data = decode_bytes(data, "utf8")
self.out.write(data)
self.out.write(tito.compat.ensure_binary(data))
def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL
@ -147,7 +141,7 @@ class TarFixer(object):
# template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
# Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked))
@ -193,9 +187,9 @@ class TarFixer(object):
field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00"
as_string = fmt % chunk_props[member]
pack_values.append(as_string.encode("utf8"))
pack_values.append(tito.compat.ensure_binary(as_string))
else:
pack_values.append(chunk_props[member].encode("utf8"))
pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
return pack_values
def process_header(self, chunk_props):
@ -218,10 +212,10 @@ class TarFixer(object):
# the size of the whole string (including the %u), the first %s is the
# keyword, the second one is the value.
#
# Since the git ref is always 40 characters we can
# pre-compute the length to put in the extended header
# Since the git ref is always 40 ASCII characters we can pre-compute the length
# to put in the extended header
comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
self.write(data_out)
self.total_length += len(data_out)
@ -241,9 +235,9 @@ class TarFixer(object):
values = self.encode_header(chunk_props)
new_chksum = 0
for val in values:
val_bytes = val.decode("utf8")
val_bytes = bytearray(tito.compat.ensure_binary(val))
for b in val_bytes:
new_chksum += ord(b)
new_chksum += b
return "%07o\x00" % new_chksum
def process_chunk(self, chunk):
@ -336,8 +330,8 @@ class TarFixer(object):
if __name__ == '__main__':
if len(sys.argv) != 4:
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])
if len(sys.argv) != 5:
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
try:
timestamp = int(sys.argv[1])
@ -346,11 +340,17 @@ if __name__ == '__main__':
gitref = sys.argv[2]
tar_file = sys.argv[3]
destination_file = sys.argv[4]
try:
dfh = open(destination_file, 'wb')
except:
print("Could not open %s" % destination_file)
try:
fh = open(tar_file, 'rb')
except:
print("Could not read %s" % tar_file)
reader = TarFixer(fh, sys.stdout, timestamp, gitref)
reader = TarFixer(fh, dfh, timestamp, gitref)
reader.fix()

View file

@ -5,10 +5,8 @@ Version: 0.0.1
Release: 1%{?dist}
Summary: tito test package for the external source builder
URL: https://example.com
Group: Applications/Internet
License: GPLv2
Source0: %{name}-%{version}.tar.gz
BuildRoot: %{_tmppath}/%{name}-root-%(%{__id_u} -n)
BuildArch: noarch
%description
@ -19,10 +17,8 @@ Nobody cares.
%build
%install
rm -rf %{buildroot}
%clean
rm -rf %{buildroot}
%files
%defattr(-,root,root)

Binary file not shown.

Binary file not shown.

View file

@ -1,8 +1,11 @@
# coding=utf-8
import hashlib
import os
import tarfile
import unittest
import io
from tito.compat import StringIO, encode_bytes
from tito.compat import StringIO, ensure_binary
from tito.tar import TarFixer
from mock import Mock
@ -12,8 +15,10 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
class TarTest(unittest.TestCase):
def setUp(self):
self.out = StringIO()
self.out = io.BytesIO()
self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
self.reference_hash = self.hash_file(self.reference_file)
@ -65,7 +70,7 @@ class TarTest(unittest.TestCase):
self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
def test_fix_fails_unless_file_in_binary_mode(self):
self.fh = open(self.test_file, 'r')
@ -96,8 +101,8 @@ class TarTest(unittest.TestCase):
self.tarfixer.create_extended_header()
header = self.out.getvalue()
self.assertEqual(512, len(header))
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:])
self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
def test_calculate_checksum(self):
fields = {
@ -119,5 +124,33 @@ class TarTest(unittest.TestCase):
}
result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, "hello"]
expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
expected_result = list(map(lambda x: ensure_binary(x), expected_result))
self.assertEqual(expected_result, result)
def test_utf8_file(self):
# The goal of this test is to *not* throw a UnicodeDecodeError
self.fh = open(self.utf8_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError:
self.fail("Unable to open generated tarball")
def test_utf8_containing_file(self):
# The goal of this test is to *not* blow up due to a corrupted tarball
self.fh = open(self.utf8_containing_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError as e:
self.fail("Unable to open generated tarball: %s" % e)