Revert "Fix #335. Handle source tarballs with UTF8 characters in the name."

This reverts commit 03509b36d5.
This commit is contained in:
jesus m. rodriguez 2019-06-05 11:04:42 -04:00
parent d6c7824177
commit c2c4c5308f
5 changed files with 33 additions and 74 deletions

View file

@ -26,34 +26,26 @@ if PY2:
from ConfigParser import RawConfigParser
from StringIO import StringIO
import xmlrpclib
text_type = unicode
binary_type = str
else:
import subprocess
from configparser import NoOptionError
from configparser import RawConfigParser
from io import StringIO
import xmlrpc.client as xmlrpclib
text_type = str
binary_type = bytes
def ensure_text(x, encoding="utf8"):
if isinstance(x, binary_type):
return x.decode(encoding)
elif isinstance(x, text_type):
def decode_bytes(x, source_encoding):
if PY2:
return x
else:
raise TypeError("Not expecting type '%s'" % type(x))
return x.decode(source_encoding)
def ensure_binary(x, encoding="utf8"):
if isinstance(x, text_type):
return x.encode(encoding)
elif isinstance(x, binary_type):
def encode_bytes(x, destination_encoding):
if PY2:
return x
else:
raise TypeError("Not expecting type '%s'" % type(x))
return bytes(x, destination_encoding)
def getstatusoutput(cmd):

View file

@ -14,8 +14,8 @@
import re
import struct
import sys
import codecs
import tito.compat
from tito.compat import decode_bytes, encode_bytes
RECORD_SIZE = 512
@ -120,7 +120,7 @@ class TarFixer(object):
def full_read(self, read_size):
read = self.fh.read(read_size)
amount_read = len(read)
while amount_read < read_size:
while (amount_read < read_size):
left_to_read = read_size - amount_read
next_read = self.fh.read(left_to_read)
@ -133,7 +133,13 @@ class TarFixer(object):
return read
def write(self, data):
self.out.write(tito.compat.ensure_binary(data))
"""Write the data correctly depending on the mode of the file. While binary mode
is preferred, we support text mode for streams like stdout."""
if hasattr(self.out, 'mode') and 'b' in self.out.mode:
data = bytearray(data)
else:
data = decode_bytes(data, "utf8")
self.out.write(data)
def chunk_to_hash(self, chunk):
# Our struct template is only 500 bytes, but the last 12 bytes are NUL
@ -141,7 +147,7 @@ class TarFixer(object):
# template as '12x'. The unpack_from method will read the bytes our
# template defines from chunk and discard the rest.
unpacked = struct.unpack_from(self.struct_template, chunk)
unpacked = list(map(lambda x: tito.compat.ensure_text(x), unpacked))
unpacked = list(map(lambda x: decode_bytes(x, 'utf8'), unpacked))
# Zip what we read together with the member names and create a dictionary
chunk_props = dict(zip(self.struct_members, unpacked))
@ -187,9 +193,9 @@ class TarFixer(object):
field_size = int(re.match('(\d+)', member_template).group(1)) - 1
fmt = "%0" + str(field_size) + "o\x00"
as_string = fmt % chunk_props[member]
pack_values.append(tito.compat.ensure_binary(as_string))
pack_values.append(as_string.encode("utf8"))
else:
pack_values.append(tito.compat.ensure_binary(chunk_props[member]))
pack_values.append(chunk_props[member].encode("utf8"))
return pack_values
def process_header(self, chunk_props):
@ -212,10 +218,10 @@ class TarFixer(object):
# the size of the whole string (including the %u), the first %s is the
# keyword, the second one is the value.
#
# Since the git ref is always 40 ASCII characters we can pre-compute the length
# to put in the extended header
# Since the git ref is always 40 characters we can
# pre-compute the length to put in the extended header
comment = "52 comment=%s\n" % self.gitref
data_out = struct.pack("=52s460x", tito.compat.ensure_binary(comment, "ascii"))
data_out = struct.pack("=52s460x", encode_bytes(comment, "ascii"))
self.write(data_out)
self.total_length += len(data_out)
@ -235,9 +241,9 @@ class TarFixer(object):
values = self.encode_header(chunk_props)
new_chksum = 0
for val in values:
val_bytes = bytearray(tito.compat.ensure_binary(val))
val_bytes = val.decode("utf8")
for b in val_bytes:
new_chksum += b
new_chksum += ord(b)
return "%07o\x00" % new_chksum
def process_chunk(self, chunk):
@ -330,8 +336,8 @@ class TarFixer(object):
if __name__ == '__main__':
if len(sys.argv) != 5:
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE DESTINATION_FILE" % sys.argv[0])
if len(sys.argv) != 4:
sys.exit("Usage: %s UNIX_TIMESTAMP GIT_HASH TAR_FILE" % sys.argv[0])
try:
timestamp = int(sys.argv[1])
@ -340,17 +346,11 @@ if __name__ == '__main__':
gitref = sys.argv[2]
tar_file = sys.argv[3]
destination_file = sys.argv[4]
try:
dfh = open(destination_file, 'wb')
except:
print("Could not open %s" % destination_file)
try:
fh = open(tar_file, 'rb')
except:
print("Could not read %s" % tar_file)
reader = TarFixer(fh, dfh, timestamp, gitref)
reader = TarFixer(fh, sys.stdout, timestamp, gitref)
reader.fix()

Binary file not shown.

View file

@ -1,11 +1,8 @@
# coding=utf-8
import hashlib
import os
import tarfile
import unittest
import io
from tito.compat import StringIO, ensure_binary
from tito.compat import StringIO, encode_bytes
from tito.tar import TarFixer
from mock import Mock
@ -15,10 +12,8 @@ EXPECTED_REF = "3518d720bff20db887b7a5e5dddd411d14dca1f9"
class TarTest(unittest.TestCase):
def setUp(self):
self.out = io.BytesIO()
self.out = StringIO()
self.tarfixer = TarFixer(None, self.out, EXPECTED_TIMESTAMP, EXPECTED_REF)
self.utf8_containing_file = os.path.join(os.path.dirname(__file__), 'resources', 'les_misérables.tar')
self.utf8_file = os.path.join(os.path.dirname(__file__), 'resources', 'archivé.tar')
self.test_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive.tar')
self.reference_file = os.path.join(os.path.dirname(__file__), 'resources', 'archive-fixed.tar')
self.reference_hash = self.hash_file(self.reference_file)
@ -70,7 +65,7 @@ class TarTest(unittest.TestCase):
self.fh = open(self.test_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
self.assertEqual(self.reference_hash, self.hash_buffer(encode_bytes(self.out.getvalue(), "utf8")))
def test_fix_fails_unless_file_in_binary_mode(self):
self.fh = open(self.test_file, 'r')
@ -101,8 +96,8 @@ class TarTest(unittest.TestCase):
self.tarfixer.create_extended_header()
header = self.out.getvalue()
self.assertEqual(512, len(header))
self.assertEqual(ensure_binary("52 comment=%s\n" % EXPECTED_REF), header[:52])
self.assertEqual(ensure_binary("\x00" * (512 - 53)), header[53:])
self.assertEqual("52 comment=%s\n" % EXPECTED_REF, header[:52])
self.assertEqual("\x00" * (512 - 53), header[53:])
def test_calculate_checksum(self):
fields = {
@ -124,33 +119,5 @@ class TarTest(unittest.TestCase):
}
result = self.tarfixer.encode_header(chunk, ['mode', 'name'])
expected_result = ["%07o\x00" % mode, "hello"]
expected_result = list(map(lambda x: ensure_binary(x), expected_result))
expected_result = list(map(lambda x: encode_bytes(x, "utf8"), expected_result))
self.assertEqual(expected_result, result)
def test_utf8_file(self):
# The goal of this test is to *not* raise a UnicodeDecodeError
self.fh = open(self.utf8_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
self.assertEqual(self.reference_hash, self.hash_buffer(self.out.getvalue()))
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError:
self.fail("Unable to open generated tarball")
def test_utf8_containing_file(self):
# The goal of this test is to *not* blow up due to a corrupted tarball
self.fh = open(self.utf8_containing_file, 'rb')
self.tarfixer.fh = self.fh
self.tarfixer.fix()
# rewind the buffer
self.out.seek(0)
try:
tarball = tarfile.open(fileobj=self.out, mode="r")
except tarfile.TarError as e:
self.fail("Unable to open generated tarball: %s" % e)