diff --git a/src/3rdparty/libdeflate/NOTE b/src/3rdparty/libdeflate/NOTE index de5ff072d..a25446a0d 100644 --- a/src/3rdparty/libdeflate/NOTE +++ b/src/3rdparty/libdeflate/NOTE @@ -1,2 +1,2 @@ -This is Git checkout 3cc3608e9c340e4996dff3d0633acf2ec537e12a +This is Git checkout b01537448e8eaf0803e38bdba5acef1d1c8effba from https://github.com/ebiggers/libdeflate that has not been modified. diff --git a/src/3rdparty/libdeflate/common/compiler_gcc.h b/src/3rdparty/libdeflate/common/compiler_gcc.h index 2a45b05f3..5f8811770 100644 --- a/src/3rdparty/libdeflate/common/compiler_gcc.h +++ b/src/3rdparty/libdeflate/common/compiler_gcc.h @@ -122,15 +122,30 @@ # endif /* - * Determine whether CRC32 intrinsics are supported. + * Determine whether ARM CRC32 intrinsics are supported. * - * With gcc r274827 or later (gcc 10.1+, 9.3+, or 8.4+), or with clang, - * they work as expected. (Well, not quite. There's still a bug, but we - * have to work around it later when including arm_acle.h.) + * This support has been affected by several gcc bugs, which we must avoid + * by only allowing gcc versions that have the corresponding fixes. First, + * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a + * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. + * Second, gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture + * directives [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed + * when binutils is 2.34 or later, due to + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second + * set of prerequisites, as they are stricter and we have no way to detect + * the binutils version in C source without requiring a configure script. + * + * Yet another gcc bug makes arm_acle.h sometimes not define the crc + * functions even when the corresponding builtins are available. However, + * we work around this later when including arm_acle.h. + * + * Things are a bit easier with clang -- we can just check whether the + * crc builtins are available. However, clang's arm_acle.h is broken in + * the same way as gcc's, which we work around later in the same way. */ -# if GCC_PREREQ(10, 1) || \ - (GCC_PREREQ(9, 3) && !GCC_PREREQ(10, 0)) || \ - (GCC_PREREQ(8, 4) && !GCC_PREREQ(9, 0)) || \ +# if GCC_PREREQ(11, 3) || \ + (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ + (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0)) || \ (defined(__clang__) && __has_builtin(__builtin_arm_crc32b)) # define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 1 # endif @@ -199,3 +214,22 @@ typedef char __v64qi __attribute__((__vector_size__(64))); #define bsr64(n) (63 - __builtin_clzll(n)) #define bsf32(n) __builtin_ctz(n) #define bsf64(n) __builtin_ctzll(n) + +#if defined(__arm__) && \ + (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) +static forceinline unsigned int +rbit32(unsigned int v) +{ + __asm__("rbit %0, %1\n" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#elif defined(__aarch64__) +static forceinline unsigned int +rbit32(unsigned int v) +{ + __asm__("rbit %w0, %w1\n" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#endif /* __aarch64__ */ diff --git a/src/3rdparty/libdeflate/lib/arm/cpu_features.c b/src/3rdparty/libdeflate/lib/arm/cpu_features.c index 60b1be3ee..94f59b839 100644 --- a/src/3rdparty/libdeflate/lib/arm/cpu_features.c +++ b/src/3rdparty/libdeflate/lib/arm/cpu_features.c @@ -26,13 +26,8 @@ */ /* - * ARM processors don't have a standard way for unprivileged programs to detect - * processor features. 
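A note on the GCC_PREREQ() checks in the compiler_gcc.h hunk above: they are plain __GNUC__/__GNUC_MINOR__ comparisons. A minimal sketch of such a helper (libdeflate defines its own version of this macro earlier in the same header; the exact form shown here is an assumption):

    /* Sketch: true if the compiler is gcc 'major.minor' or newer. */
    #define GCC_PREREQ(major, minor)                            \
            (__GNUC__ > (major) ||                              \
             (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))

With this reading, GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0) selects exactly the gcc 10.4+ point releases, which is how the new condition pins each of the 9.x, 10.x, and 11.3+ fix branches separately.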
But, on Linux we can read the AT_HWCAP and AT_HWCAP2 - * values from /proc/self/auxv. - * - * Ideally we'd use the C library function getauxval(), but it's not guaranteed - * to be available: it was only added to glibc in 2.16, and in Android it was - * added to API level 18 for ARM and level 21 for AArch64. + * ARM CPUs don't have a standard way for unprivileged programs to detect CPU + * features. But an OS-specific way can be used when available. */ #include "../cpu_features_common.h" /* must be included first */ @@ -40,6 +35,16 @@ #if ARM_CPU_FEATURES_ENABLED +#ifdef __linux__ +/* + * On Linux, arm32 and arm64 CPU features can be detected by reading the + * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. + * + * Ideally we'd use the C library function getauxval(), but it's not guaranteed + * to be available: it was only added to glibc in 2.16, and in Android it was + * added to API level 18 for arm32 and level 21 for arm64. + */ + #include <errno.h> #include <fcntl.h> #include <string.h> @@ -48,8 +53,6 @@ #define AT_HWCAP 16 #define AT_HWCAP2 26 -volatile u32 _cpu_features = 0; - static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) { int fd; @@ -92,13 +95,7 @@ out: close(fd); } -static const struct cpu_feature arm_cpu_feature_table[] = { - {ARM_CPU_FEATURE_NEON, "neon"}, - {ARM_CPU_FEATURE_PMULL, "pmull"}, - {ARM_CPU_FEATURE_CRC32, "crc32"}, -}; - -void setup_cpu_features(void) +static u32 get_arm_cpu_features(void) { u32 features = 0; unsigned long hwcap = 0; @@ -123,6 +120,56 @@ void setup_cpu_features(void) if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ features |= ARM_CPU_FEATURE_CRC32; #endif + return features; +} + +#elif defined(__APPLE__) +/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */ + +#include <sys/types.h> +#include <sys/sysctl.h> + +static const struct { + const char *name; + u32 feature; +} feature_sysctls[] = { + { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, + { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, + { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, + { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, +}; + +static u32 get_arm_cpu_features(void) +{ + u32 features = 0; + size_t i; + + for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { + const char *name = feature_sysctls[i].name; + u32 val = 0; + size_t valsize = sizeof(val); + + if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && + valsize == sizeof(val) && val == 1) + features |= feature_sysctls[i].feature; + } + return features; +} +#else +#error "unhandled case" +#endif + +static const struct cpu_feature arm_cpu_feature_table[] = { + {ARM_CPU_FEATURE_NEON, "neon"}, + {ARM_CPU_FEATURE_PMULL, "pmull"}, + {ARM_CPU_FEATURE_CRC32, "crc32"}, +}; + +volatile u32 _cpu_features = 0; + +void setup_cpu_features(void) +{ + u32 features = get_arm_cpu_features(); disable_cpu_features_for_testing(&features, arm_cpu_feature_table, ARRAY_LEN(arm_cpu_feature_table)); diff --git a/src/3rdparty/libdeflate/lib/arm/cpu_features.h b/src/3rdparty/libdeflate/lib/arm/cpu_features.h index 69d723598..8f7172f25 100644 --- a/src/3rdparty/libdeflate/lib/arm/cpu_features.h +++ b/src/3rdparty/libdeflate/lib/arm/cpu_features.h @@ -8,7 +8,8 @@ #include "../lib_common.h" #if (defined(__arm__) || defined(__aarch64__)) && \ - defined(__linux__) && \ + (defined(__linux__) || \ + (defined(__aarch64__) && defined(__APPLE__))) && \ COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ !defined(FREESTANDING) # define ARM_CPU_FEATURES_ENABLED 1 diff --git a/src/3rdparty/libdeflate/lib/arm/crc32_impl.h b/src/3rdparty/libdeflate/lib/arm/crc32_impl.h index 
238a85a80..73cd0203d 100644 --- a/src/3rdparty/libdeflate/lib/arm/crc32_impl.h +++ b/src/3rdparty/libdeflate/lib/arm/crc32_impl.h @@ -80,13 +80,17 @@ crc32_arm(u32 remainder, const u8 *p, size_t size) size--; } - while (size >= 32) { + while (size >= 64) { remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 0))); remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 1))); remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 2))); remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 3))); - p += 32; - size -= 32; + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 4))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 5))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 6))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 7))); + p += 64; + size -= 64; } while (size >= 8) { diff --git a/src/3rdparty/libdeflate/lib/bt_matchfinder.h b/src/3rdparty/libdeflate/lib/bt_matchfinder.h index 88171414b..d5b2dd561 100644 --- a/src/3rdparty/libdeflate/lib/bt_matchfinder.h +++ b/src/3rdparty/libdeflate/lib/bt_matchfinder.h @@ -139,14 +139,14 @@ bt_right_child(struct bt_matchfinder *mf, s32 node) /* Advance the binary tree matchfinder by one byte, optionally recording * matches. @record_matches should be a compile-time constant. */ static forceinline struct lz_match * -bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf, - const u8 * const restrict in_base, +bt_matchfinder_advance_one_byte(struct bt_matchfinder * const mf, + const u8 * const in_base, const ptrdiff_t cur_pos, const u32 max_len, const u32 nice_len, const u32 max_search_depth, - u32 * const restrict next_hashes, - struct lz_match * restrict lz_matchptr, + u32 * const next_hashes, + struct lz_match *lz_matchptr, const bool record_matches) { const u8 *in_next = in_base + cur_pos; diff --git a/src/3rdparty/libdeflate/lib/deflate_compress.c b/src/3rdparty/libdeflate/lib/deflate_compress.c index 377b51f44..f6e90aa82 100644 --- a/src/3rdparty/libdeflate/lib/deflate_compress.c +++ b/src/3rdparty/libdeflate/lib/deflate_compress.c @@ -52,10 +52,15 @@ /* * This is the minimum block length that the compressor will use, in - * uncompressed bytes. It is also approximately the amount by which the final - * block is allowed to grow past the soft maximum length in order to avoid using - * a very short block at the end. This should be a value below which using - * shorter blocks is unlikely to be worthwhile, due to the per-block overhead. + * uncompressed bytes. This should be a value below which using shorter blocks + * is unlikely to be worthwhile, due to the per-block overhead. This value does + * not apply to the final block, which may be shorter than this (if the input is + * shorter, it will have to be), or to the final uncompressed block in a series + * of uncompressed blocks that cover more than UINT16_MAX bytes. + * + * This value is also approximately the amount by which what would otherwise be + * the second-to-last block is allowed to grow past the soft maximum length in + * order to avoid having to use a very short final block. * * Defining a fixed minimum block length is needed in order to guarantee a * reasonable upper bound on the compressed size. 
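The crc32_impl.h hunk above simply widens the unrolled hardware-CRC32 loop from 32 to 64 bytes per iteration. For orientation, a self-contained sketch of the underlying pattern, assuming an AArch64 little-endian target compiled with CRC support (e.g. -march=armv8-a+crc) and the standard ACLE intrinsics from arm_acle.h; crc32_hw() is a hypothetical name, not the function in the diff:

    #include <arm_acle.h>
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t
    crc32_hw(uint32_t crc, const uint8_t *p, size_t n)
    {
            /* Main loop: one 8-byte CRC32 instruction per iteration. */
            while (n >= 8) {
                    uint64_t v;

                    __builtin_memcpy(&v, p, 8); /* unaligned load; LE assumed */
                    crc = __crc32d(crc, v);
                    p += 8;
                    n -= 8;
            }
            /* Tail: one byte at a time. */
            while (n--)
                    crc = __crc32b(crc, *p++);
            return crc;
    }

Unrolling the main loop, as the diff does, mostly helps by reducing loop overhead and letting the loads run ahead of the serially dependent CRC32 instructions.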
It's also needed because our @@ -199,6 +204,10 @@ check_buildtime_parameters(void) MIN_BLOCK_LENGTH); STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= MIN_BLOCK_LENGTH); +#if SUPPORT_NEAR_OPTIMAL_PARSING + STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= + MATCH_CACHE_LENGTH); +#endif /* The definition of MAX_BLOCK_LENGTH assumes this. */ STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); @@ -325,6 +334,11 @@ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; +/* Table: precode symbol => number of extra bits */ +static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 +}; + /* Codewords for the DEFLATE Huffman codes */ struct deflate_codewords { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; @@ -380,17 +394,17 @@ struct deflate_sequence { */ u16 offset; - /* - * If 'length' doesn't indicate end-of-block, then this is the offset - * symbol of the match which follows the literals. - */ - u8 offset_symbol; - /* * If 'length' doesn't indicate end-of-block, then this is the length * slot of the match which follows the literals. */ u8 length_slot; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * slot of the match which follows the literals. + */ + u8 offset_slot; }; #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -462,18 +476,20 @@ struct block_split_stats { u32 num_observations; }; +struct deflate_output_bitstream; + /* The main DEFLATE compressor structure */ struct libdeflate_compressor { /* Pointer to the compress() implementation chosen at allocation time */ - size_t (*impl)(struct libdeflate_compressor *c, const u8 *in, - size_t in_nbytes, u8 *out, size_t out_nbytes_avail); + void (*impl)(struct libdeflate_compressor *c, const u8 *in, + size_t in_nbytes, struct deflate_output_bitstream *os); /* The compression level with which this compressor was created */ unsigned compression_level; - /* Anything smaller than this we won't bother trying to compress. */ - unsigned min_size_to_compress; + /* Anything of this size or less we won't bother trying to compress. */ + size_t max_passthrough_size; /* * The maximum search depth: consider at most this many potential @@ -619,13 +635,19 @@ struct libdeflate_compressor { * performance, this should have size equal to a machine word. */ typedef machine_word_t bitbuf_t; -#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) + +/* + * The capacity of the bitbuffer, in bits. This is 1 less than the real size, + * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. + */ +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) /* * Can the specified number of bits always be added to 'bitbuf' after any - * pending bytes have been flushed? + * pending bytes have been flushed? There can be up to 7 bits remaining after a + * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. */ -#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) +#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) /* * Structure to keep track of the current state of sending bits to the @@ -636,19 +658,23 @@ struct deflate_output_bitstream { /* Bits that haven't yet been written to the output buffer */ bitbuf_t bitbuf; - /* Number of bits currently held in @bitbuf */ + /* + * Number of bits currently held in @bitbuf. This can be between 0 and + * BITBUF_NBITS in general, or between 0 and 7 after a flush. 
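To make the bitbuffer arithmetic concrete: on a 64-bit build, bitbuf_t is 8 bytes, so BITBUF_NBITS is 63 and CAN_BUFFER(n) holds for n <= 56; on a 32-bit build it holds for n <= 24. A standalone compile-time check (illustrative only, using the DEFLATE format limits):

    #include <stdint.h>

    typedef uint64_t bitbuf_t; /* assumption: 64-bit build */

    #define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) /* 63 */
    #define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) /* n <= 56 */

    /*
     * A full match costs at most 15 + 5 + 15 + 13 = 48 bits: a litlen
     * codeword (<= 15 bits), extra length bits (<= 5), an offset codeword
     * (<= 15), and extra offset bits (<= 13). 48 <= 56, so on 64-bit
     * builds a whole match fits in the bitbuffer between flushes.
     */
    typedef char match_fits_in_bitbuf[CAN_BUFFER(15 + 5 + 15 + 13) ? 1 : -1];

On 32-bit builds the same test fails, which is why the match-writing code later in this diff keeps its intermediate FLUSH_BITS() calls behind !CAN_BUFFER(...) checks.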
+ */ unsigned bitcount; - /* Pointer to the beginning of the output buffer */ - u8 *begin; - /* * Pointer to the position in the output buffer at which the next byte * should be written */ u8 *next; - /* Pointer to just past the end of the output buffer */ + /* + * Pointer to near the end of the output buffer. 'next' will never + * exceed this. There are OUTPUT_END_PADDING bytes reserved after this + * to allow branchlessly writing a whole word at this location. + */ u8 *end; }; @@ -663,94 +689,38 @@ struct deflate_output_bitstream { */ #define OUTPUT_END_PADDING 8 -/* - * Initialize the output bitstream. 'size' must be at least OUTPUT_END_PADDING. - */ -static void -deflate_init_output(struct deflate_output_bitstream *os, - void *buffer, size_t size) -{ - os->bitbuf = 0; - os->bitcount = 0; - os->begin = buffer; - os->next = os->begin; - os->end = os->begin + size - OUTPUT_END_PADDING; -} - /* * Add some bits to the bitbuffer variable of the output bitstream. The caller - * must ensure that os->bitcount + num_bits <= BITBUF_NBITS, by calling - * deflate_flush_bits() frequently enough. + * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() + * frequently enough. */ -static forceinline void -deflate_add_bits(struct deflate_output_bitstream *os, - bitbuf_t bits, unsigned num_bits) -{ - os->bitbuf |= bits << os->bitcount; - os->bitcount += num_bits; -} +#define ADD_BITS(bits, n) \ +do { \ + bitbuf |= (bitbuf_t)(bits) << bitcount; \ + bitcount += (n); \ + ASSERT(bitcount <= BITBUF_NBITS); \ +} while (0) /* Flush bits from the bitbuffer variable to the output buffer. */ -static forceinline void -deflate_flush_bits(struct deflate_output_bitstream *os) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - /* Flush a whole word (branchlessly). */ - put_unaligned_leword(os->bitbuf, os->next); - os->bitbuf >>= os->bitcount & ~7; - os->next += MIN(os->end - os->next, os->bitcount >> 3); - os->bitcount &= 7; - } else { - /* Flush a byte at a time. */ - while (os->bitcount >= 8) { - *os->next = os->bitbuf; - if (os->next != os->end) - os->next++; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - } -} - -/* - * Add bits, then flush right away. Only use this where it is difficult to - * batch up calls to deflate_add_bits(). - */ -static forceinline void -deflate_write_bits(struct deflate_output_bitstream *os, - bitbuf_t bits, unsigned num_bits) -{ - deflate_add_bits(os, bits, num_bits); - deflate_flush_bits(os); -} - -/* Align the bitstream on a byte boundary. */ -static forceinline void -deflate_align_bitstream(struct deflate_output_bitstream *os) -{ - os->bitcount += -os->bitcount & 7; - deflate_flush_bits(os); -} - -/* - * Flush any remaining bits to the output buffer if needed. Return the total - * number of bytes that have been written to the output buffer since - * deflate_init_output(), or 0 if an overflow occurred. - */ -static size_t -deflate_flush_output(struct deflate_output_bitstream *os) -{ - if (os->next == os->end) /* overflow? */ - return 0; - - while ((int)os->bitcount > 0) { - *os->next++ = os->bitbuf; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - - return os->next - os->begin; -} +#define FLUSH_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST) { \ + /* Flush a whole word (branchlessly). */ \ + put_unaligned_leword(bitbuf, out_next); \ + bitbuf >>= bitcount & ~7; \ + out_next += MIN(out_end - out_next, bitcount >> 3); \ + bitcount &= 7; \ + } else { \ + /* Flush a byte at a time. 
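The branchless fast path of FLUSH_BITS() above can be read in isolation. A sketch of the same idea, assuming a little-endian target and omitting the MIN() end-of-buffer clamp that the real macro applies (flush_word() is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    static unsigned
    flush_word(uint64_t *bitbuf, unsigned bitcount, uint8_t **out_next)
    {
            /* Write the whole word unconditionally (put_unaligned_leword). */
            memcpy(*out_next, bitbuf, 8);
            /* Advance past the complete bytes only. */
            *out_next += bitcount >> 3;
            /* Drop the flushed bits; 0-7 bits remain buffered. */
            *bitbuf >>= bitcount & ~7u;
            return bitcount & 7;
    }

Two details tie back to the definitions above: the unconditional 8-byte store is why OUTPUT_END_PADDING bytes must stay reserved past 'end', and capping bitcount at BITBUF_NBITS (63, not 64) keeps the shift count at most 56, avoiding an undefined full-width shift.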
*/ \ + while (bitcount >= 8) { \ + *out_next = bitbuf; \ + if (out_next != out_end) \ + out_next++; \ + bitcount -= 8; \ + bitbuf >>= 8; \ + } \ + } \ +} while (0) /* * Given the binary tree node A[subtree_idx] whose children already satisfy the @@ -816,7 +786,9 @@ heap_sort(u32 A[], unsigned length) } #define NUM_SYMBOL_BITS 10 -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) #define GET_NUM_COUNTERS(num_syms) (num_syms) @@ -828,11 +800,10 @@ heap_sort(u32 A[], unsigned length) * contain the frequency. * * @num_syms - * Number of symbols in the alphabet. - * Can't be greater than (1 << NUM_SYMBOL_BITS). + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. * * @freqs[num_syms] - * The frequency of each symbol. + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. * * @lens[num_syms] * An array that eventually will hold the length of each codeword. This @@ -846,8 +817,7 @@ heap_sort(u32 A[], unsigned length) * number of symbols that have nonzero frequency. */ static unsigned -sort_symbols(unsigned num_syms, const u32 freqs[restrict], - u8 lens[restrict], u32 symout[restrict]) +sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) { unsigned sym; unsigned i; @@ -942,68 +912,59 @@ sort_symbols(unsigned num_syms, const u32 freqs[restrict], static void build_tree(u32 A[], unsigned sym_count) { - /* - * Index, in 'A', of next lowest frequency symbol that has not yet been - * processed. - */ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ unsigned i = 0; /* - * Index, in 'A', of next lowest frequency parentless non-leaf node; or, - * if equal to 'e', then no such node exists yet. + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node */ unsigned b = 0; - /* Index, in 'A', of next node to allocate as a non-leaf. */ + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ unsigned e = 0; do { - unsigned m, n; - u32 freq_shifted; - - /* Choose the two next lowest frequency entries. */ - - if (i != sym_count && - (b == e || - (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - m = i++; - else - m = b++; - - if (i != sym_count && - (b == e || - (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - n = i++; - else - n = b++; + u32 new_freq; /* - * Allocate a non-leaf node and link the entries to it. + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. * - * If we link an entry that we're visiting for the first time - * (via index 'i'), then we're actually linking a leaf node and - * it will have no effect, since the leaf will be overwritten - * with a non-leaf when index 'e' catches up to it. But it's - * not any slower to unconditionally set the parent index. - * - * We also compute the frequency of the non-leaf node as the sum - * of its two children's frequencies. + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. 
*/ - - freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); - - A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; - e++; - } while (sym_count - e > 1); + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); /* - * When just one entry remains, it is a "leaf" that was linked - * to some other node. We ignore it, since the rest of the - * array contains the non-leaves which we need. (Note that - * we're assuming the cases with 0 or 1 symbols were handled - * separately.) + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. */ + } while (++e < last_idx); } /* @@ -1032,8 +993,8 @@ build_tree(u32 A[], unsigned sym_count) * The maximum permissible codeword length. */ static void -compute_length_counts(u32 A[restrict], unsigned root_idx, - unsigned len_counts[restrict], unsigned max_codeword_len) +compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) { unsigned len; int node; @@ -1103,32 +1064,66 @@ compute_length_counts(u32 A[restrict], unsigned root_idx, } } -/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ -static u32 -reverse_codeword(u32 codeword, u8 len) +/* + * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords + * after generating them. All codewords have length <= 16 bits. If the CPU has + * a bit-reversal instruction, then that is the fastest method. Otherwise the + * fastest method is to reverse the bits in each of the two bytes using a table. + * The table method is slightly faster than using bitwise operations to flip + * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed + * into a machine word and processed together using that method. 
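As a reference point for the rbit32 and table-based implementations that follow, the naive bit-at-a-time reversal looks like this (illustrative sketch, deliberately not used by the diff for speed reasons):

    #include <stdint.h>

    /* Reverse the low 'len' bits of 'codeword', one bit at a time. */
    static uint32_t
    reverse_codeword_ref(uint32_t codeword, unsigned len)
    {
            uint32_t reversed = 0;
            unsigned i;

            for (i = 0; i < len; i++)
                    reversed = (reversed << 1) | ((codeword >> i) & 1);
            return reversed;
    }

    /* Example: the 5-bit codeword 0b00110 (0x06) reverses to 0b01100 (0x0c). */

The table version below reaches the same result by reversing the two bytes independently, swapping them, and then shifting off the unused (16 - len) low bits.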
+ */ + +#ifdef rbit32 +static forceinline u32 reverse_codeword(u32 codeword, u8 len) +{ + return rbit32(codeword) >> ((32 - len) & 31); +} +#else +/* Generated by scripts/gen_bitreverse_tab.py */ +static const u8 bitreverse_tab[256] = { + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, +}; + +static forceinline u32 reverse_codeword(u32 codeword, u8 len) { - /* - * The following branchless algorithm is faster than going bit by bit. - * Note: since no codewords are longer than 16 bits, we only need to - * reverse the low 16 bits of the 'u32'. - */ STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - - /* Flip adjacent 1-bit fields. */ - codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); - - /* Flip adjacent 2-bit fields. */ - codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); - - /* Flip adjacent 4-bit fields. */ - codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); - - /* Flip adjacent 8-bit fields. */ - codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); - - /* Return the high 'len' bits of the bit-reversed 16 bit value. */ + codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | + bitreverse_tab[codeword >> 8]; return codeword >> (16 - len); } +#endif /* !rbit32 */ /* * Generate the codewords for a canonical Huffman code. @@ -1154,8 +1149,7 @@ reverse_codeword(u32 codeword, u8 len) * frequency. This is the length of the 'A' and 'len' arrays. */ static void -gen_codewords(u32 A[restrict], u8 lens[restrict], - const unsigned len_counts[restrict], +gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], unsigned max_codeword_len, unsigned num_syms) { u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; @@ -1208,15 +1202,15 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * @num_syms * The number of symbols in the alphabet. The symbols are the integers in * the range [0, num_syms - 1]. This parameter must be at least 2 and - * can't be greater than (1 << NUM_SYMBOL_BITS). 
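gen_codewords() implements the canonical-code rule from RFC 1951 section 3.2.2: codewords of each length are consecutive and start right after the previous length's codewords, shifted left by one. A compact sketch of that rule with illustrative names (the real function works in-place on the 'A' array):

    #include <stdint.h>

    #define MAX_CODEWORD_LEN 15 /* DEFLATE limit */

    static void
    assign_canonical_codewords(uint32_t codewords[], const uint8_t lens[],
                               unsigned num_syms, const unsigned len_counts[])
    {
            uint32_t next_codewords[MAX_CODEWORD_LEN + 1] = { 0 };
            unsigned len, sym;

            /* First codeword of each length, per RFC 1951. */
            for (len = 2; len <= MAX_CODEWORD_LEN; len++)
                    next_codewords[len] =
                            (next_codewords[len - 1] + len_counts[len - 1]) << 1;

            /* Equal-length symbols get consecutive codewords, in symbol
             * order. (DEFLATE then transmits these bit-reversed.) */
            for (sym = 0; sym < num_syms; sym++)
                    if (lens[sym] != 0)
                            codewords[sym] = next_codewords[lens[sym]]++;
    }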
+ * must not exceed (1 << NUM_SYMBOL_BITS). * * @max_codeword_len * The maximum permissible codeword length. * * @freqs - * An array of @num_syms entries, each of which specifies the frequency of - * the corresponding symbol. It is valid for some, none, or all of the - * frequencies to be 0. + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. * * @lens * An array of @num_syms entries in which this function will return the @@ -1299,13 +1293,13 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], */ static void deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[restrict], - u8 lens[restrict], u32 codewords[restrict]) + const u32 freqs[], u8 lens[], u32 codewords[]) { u32 *A = codewords; unsigned num_used_syms; STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); /* * We begin by sorting the symbols primarily by frequency and @@ -1446,21 +1440,9 @@ deflate_get_offset_slot(unsigned offset) #endif } -/* Write the header fields common to all DEFLATE block types. */ -static void -deflate_write_block_header(struct deflate_output_bitstream *os, - bool is_final_block, unsigned block_type) -{ - deflate_add_bits(os, is_final_block, 1); - deflate_add_bits(os, block_type, 2); - deflate_flush_bits(os); -} - static unsigned -deflate_compute_precode_items(const u8 lens[restrict], - const unsigned num_lens, - u32 precode_freqs[restrict], - unsigned precode_items[restrict]) +deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, + u32 precode_freqs[], unsigned precode_items[]) { unsigned *itemptr; unsigned run_start; @@ -1545,7 +1527,7 @@ deflate_compute_precode_items(const u8 lens[restrict], * immediately precede the compressed codeword lengths of the larger code. */ -/* Precompute the information needed to output Huffman codes. */ +/* Precompute the information needed to output dynamic Huffman codes. */ static void deflate_precompute_huffman_header(struct libdeflate_compressor *c) { @@ -1609,284 +1591,86 @@ deflate_precompute_huffman_header(struct libdeflate_compressor *c) } } -/* Output the Huffman codes. */ -static void -deflate_write_huffman_header(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os) -{ - unsigned i; - - deflate_add_bits(os, c->num_litlen_syms - 257, 5); - deflate_add_bits(os, c->num_offset_syms - 1, 5); - deflate_add_bits(os, c->num_explicit_lens - 4, 4); - deflate_flush_bits(os); - - /* Output the lengths of the codewords in the precode. */ - for (i = 0; i < c->num_explicit_lens; i++) { - deflate_write_bits(os, c->precode_lens[ - deflate_precode_lens_permutation[i]], 3); - } - - /* Output the encoded lengths of the codewords in the larger code. 
*/ - for (i = 0; i < c->num_precode_items; i++) { - unsigned precode_item = c->precode_items[i]; - unsigned precode_sym = precode_item & 0x1F; - - deflate_add_bits(os, c->precode_codewords[precode_sym], - c->precode_lens[precode_sym]); - if (precode_sym >= 16) { - if (precode_sym == 16) - deflate_add_bits(os, precode_item >> 5, 2); - else if (precode_sym == 17) - deflate_add_bits(os, precode_item >> 5, 3); - else - deflate_add_bits(os, precode_item >> 5, 7); - } - STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); - deflate_flush_bits(os); - } -} - -static forceinline void -deflate_write_literal_run(struct deflate_output_bitstream *os, - const u8 *in_next, u32 litrunlen, - const struct deflate_codes *codes) -{ -#if 1 - while (litrunlen >= 4) { - unsigned lit0 = in_next[0]; - unsigned lit1 = in_next[1]; - unsigned lit2 = in_next[2]; - unsigned lit3 = in_next[3]; - - deflate_add_bits(os, codes->codewords.litlen[lit0], - codes->lens.litlen[lit0]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit1], - codes->lens.litlen[lit1]); - if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit2], - codes->lens.litlen[lit2]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit3], - codes->lens.litlen[lit3]); - deflate_flush_bits(os); - in_next += 4; - litrunlen -= 4; - } - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, - codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - } - } - if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - } -#else - do { - unsigned lit = *in_next++; - - deflate_write_bits(os, codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - } while (--litrunlen); -#endif -} - -static forceinline void -deflate_write_match(struct deflate_output_bitstream * restrict os, - unsigned length, unsigned length_slot, - unsigned offset, unsigned offset_symbol, - const struct deflate_codes * restrict codes) -{ - unsigned litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot; - - /* Litlen symbol */ - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - - /* Extra length bits */ - STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS)); - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Offset symbol */ - deflate_add_bits(os, codes->codewords.offset[offset_symbol], - codes->lens.offset[offset_symbol]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Extra offset bits */ - deflate_add_bits(os, offset - deflate_offset_slot_base[offset_symbol], - 
deflate_extra_offset_bits[offset_symbol]); - - deflate_flush_bits(os); -} - -static void -deflate_write_sequences(struct deflate_output_bitstream * restrict os, - const struct deflate_codes * restrict codes, - const struct deflate_sequence sequences[restrict], - const u8 * restrict in_next) -{ - const struct deflate_sequence *seq = sequences; - - for (;;) { - u32 litrunlen = seq->litrunlen_and_length & SEQ_LITRUNLEN_MASK; - unsigned length = seq->litrunlen_and_length >> SEQ_LENGTH_SHIFT; - - if (litrunlen) { - deflate_write_literal_run(os, in_next, litrunlen, - codes); - in_next += litrunlen; - } - - if (length == 0) - return; - - deflate_write_match(os, length, seq->length_slot, - seq->offset, seq->offset_symbol, codes); - - in_next += length; - seq++; - } -} - -#if SUPPORT_NEAR_OPTIMAL_PARSING -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and write out the matches/literals using the specified - * Huffman codes. - */ -static void -deflate_write_item_list(struct deflate_output_bitstream *os, - const struct deflate_codes *codes, - struct libdeflate_compressor *c, - u32 block_length) -{ - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = - &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - - if (length == 1) { - /* Literal */ - deflate_write_bits(os, codes->codewords.litlen[offset], - codes->lens.litlen[offset]); - } else { - /* Match */ - deflate_write_match(os, length, - deflate_length_slot[length], - offset, - c->p.n.offset_slot_full[offset], - codes); - } - cur_node += length; - } while (cur_node != end_node); -} -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Output the end-of-block symbol. */ -static void -deflate_write_end_of_block(struct deflate_output_bitstream *os, - const struct deflate_codes *codes) -{ - deflate_write_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], - codes->lens.litlen[DEFLATE_END_OF_BLOCK]); -} - -static void -deflate_write_uncompressed_block(struct deflate_output_bitstream *os, - const u8 *data, u16 len, - bool is_final_block) -{ - deflate_write_block_header(os, is_final_block, - DEFLATE_BLOCKTYPE_UNCOMPRESSED); - deflate_align_bitstream(os); - - if (4 + (u32)len >= os->end - os->next) { - os->next = os->end; - return; - } - - put_unaligned_le16(len, os->next); - os->next += 2; - put_unaligned_le16(~len, os->next); - os->next += 2; - memcpy(os->next, data, len); - os->next += len; -} - -static void -deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, - const u8 *data, size_t data_length, - bool is_final_block) -{ - do { - u16 len = MIN(data_length, UINT16_MAX); - - deflate_write_uncompressed_block(os, data, len, - is_final_block && len == data_length); - data += len; - data_length -= len; - } while (data_length != 0); -} +/* Write a match to the output buffer. 
*/ +#define WRITE_MATCH(codes_, length_, length_slot_, offset_, offset_slot_) \ +do { \ + const struct deflate_codes *codes__ = (codes_); \ + unsigned length__ = (length_); \ + unsigned length_slot__ = (length_slot_); \ + unsigned offset__ = (offset_); \ + unsigned offset_slot__ = (offset_slot_); \ + unsigned litlen_symbol__ = DEFLATE_FIRST_LEN_SYM + length_slot__; \ + \ + /* Litlen symbol */ \ + ADD_BITS(codes__->codewords.litlen[litlen_symbol__], \ + codes__->lens.litlen[litlen_symbol__]); \ + \ + /* Extra length bits */ \ + STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ + ADD_BITS(length__ - deflate_length_slot_base[length_slot__], \ + deflate_extra_length_bits[length_slot__]); \ + \ + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Offset symbol */ \ + ADD_BITS(codes__->codewords.offset[offset_slot__], \ + codes__->lens.offset[offset_slot__]); \ + \ + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Extra offset bits */ \ + ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ + deflate_extra_offset_bits[offset_slot__]); \ + \ + FLUSH_BITS(); \ +} while (0) /* * Choose the best type of block to use (dynamic Huffman, static Huffman, or * uncompressed), then output it. */ static void -deflate_flush_block(struct libdeflate_compressor * restrict c, - struct deflate_output_bitstream * restrict os, - const u8 * restrict block_begin, u32 block_length, +deflate_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, const struct deflate_sequence *sequences, bool is_final_block) { - static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, - }; - - /* Costs are measured in bits */ + /* + * It is hard to get compilers to understand that writes to 'os->next' + * don't alias 'os'. That hurts performance significantly, as + * everything in 'os' would keep getting re-loaded. ('restrict' + * *should* do the trick, but it's unreliable.) Therefore, we keep all + * the output bitstream state in local variables, and output bits using + * macros. This is similar to what the decompressor does. + */ + const u8 *in_next = block_begin; + const u8 * const in_end = block_begin + block_length; + bitbuf_t bitbuf = os->bitbuf; + unsigned bitcount = os->bitcount; + u8 *out_next = os->next; + u8 * const out_end = os->end; + /* The cost for each block type, in bits */ u32 dynamic_cost = 0; u32 static_cost = 0; u32 uncompressed_cost = 0; + u32 best_cost; struct deflate_codes *codes; - int block_type; unsigned sym; + ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); + ASSERT(block_length <= MAX_BLOCK_LENGTH); + ASSERT(bitcount <= 7); + ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); + ASSERT(out_next <= out_end); + if (sequences != NULL /* !near_optimal */ || !SUPPORT_NEAR_OPTIMAL_PARSING) { /* Tally the end-of-block symbol. */ @@ -1896,8 +1680,10 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, deflate_make_huffman_codes(&c->freqs, &c->codes); } /* Else, this was already done. */ - /* Account for the cost of sending dynamic Huffman codes. */ + /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); + + /* Account for the cost of encoding dynamic Huffman codes. 
*/ dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { u32 extra = deflate_extra_precode_bits[sym]; @@ -1907,14 +1693,16 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, } /* Account for the cost of encoding literals. */ - for (sym = 0; sym < 256; sym++) { + for (sym = 0; sym < 144; sym++) { dynamic_cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; - } - for (sym = 0; sym < 144; sym++) static_cost += c->freqs.litlen[sym] * 8; - for (; sym < 256; sym++) + } + for (; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; static_cost += c->freqs.litlen[sym] * 9; + } /* Account for the cost of encoding the end-of-block symbol. */ dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; @@ -1943,50 +1731,234 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, } /* Compute the cost of using uncompressed blocks. */ - uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + + uncompressed_cost += (-(bitcount + 3) & 7) + 32 + (40 * (DIV_ROUND_UP(block_length, UINT16_MAX) - 1)) + (8 * block_length); - /* Choose the cheapest block type. */ - if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { - block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; + /* Choose and output the cheapest type of block. */ + best_cost = MIN(static_cost, uncompressed_cost); + if (dynamic_cost < best_cost) { + const unsigned num_explicit_lens = c->num_explicit_lens; + const unsigned num_precode_items = c->num_precode_items; + unsigned precode_sym, precode_item; + unsigned i; + + /* Dynamic Huffman block */ + + best_cost = dynamic_cost; codes = &c->codes; - } else if (static_cost < uncompressed_cost) { - block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; - codes = &c->static_codes; - } else { - block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; - } + STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); + ADD_BITS(c->num_litlen_syms - 257, 5); + ADD_BITS(c->num_offset_syms - 1, 5); + ADD_BITS(num_explicit_lens - 4, 4); - /* Now actually output the block. */ + /* Output the lengths of the codewords in the precode. */ + if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + /* + * A 64-bit bitbuffer is just one bit too small to hold + * the maximum number of precode lens, so to minimize + * flushes we merge one len with the previous fields. + */ + precode_sym = deflate_precode_lens_permutation[0]; + ADD_BITS(c->precode_lens[precode_sym], 3); + FLUSH_BITS(); + i = 1; /* num_explicit_lens >= 4 */ + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->precode_lens[precode_sym], 3); + } while (++i < num_explicit_lens); + FLUSH_BITS(); + } else { + FLUSH_BITS(); + i = 0; + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->precode_lens[precode_sym], 3); + FLUSH_BITS(); + } while (++i < num_explicit_lens); + } - if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { /* - * Note: the length being flushed may exceed the maximum length - * of an uncompressed block (65535 bytes). Therefore, more than - * one uncompressed block might be needed. + * Output the lengths of the codewords in the litlen and offset + * codes, encoded by the precode. 
*/ - deflate_write_uncompressed_blocks(os, block_begin, block_length, - is_final_block); + i = 0; + do { + precode_item = c->precode_items[i]; + precode_sym = precode_item & 0x1F; + STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); + ADD_BITS(c->precode_codewords[precode_sym], + c->precode_lens[precode_sym]); + ADD_BITS(precode_item >> 5, + deflate_extra_precode_bits[precode_sym]); + FLUSH_BITS(); + } while (++i < num_precode_items); + } else if (static_cost < uncompressed_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); } else { - /* Output the block header. */ - deflate_write_block_header(os, is_final_block, block_type); + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; - /* Output the Huffman codes (dynamic Huffman blocks only). */ - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) - deflate_write_huffman_header(c, os); - - /* Output the literals, matches, and end-of-block symbol. */ - #if SUPPORT_NEAR_OPTIMAL_PARSING - if (sequences == NULL) - deflate_write_item_list(os, codes, c, block_length); - else - #endif - deflate_write_sequences(os, codes, sequences, - block_begin); - deflate_write_end_of_block(os, codes); + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + if (out_end - out_next < + (bitcount + 3 + 7) / 8 + 4 + len) { + /* Not enough output space remaining. */ + out_next = out_end; + goto out; + } + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. + */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; } + + /* Output the literals and matches for a dynamic or static block. */ + ASSERT(bitcount <= 7); +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (sequences == NULL) { + /* Output the literals and matches from the minimum-cost path */ + struct deflate_optimum_node *cur_node = + &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> + OPTIMUM_OFFSET_SHIFT; + if (length == 1) { + /* Literal */ + ADD_BITS(codes->codewords.litlen[offset], + codes->lens.litlen[offset]); + FLUSH_BITS(); + } else { + /* Match */ + WRITE_MATCH(codes, length, + deflate_length_slot[length], offset, + c->p.n.offset_slot_full[offset]); + } + cur_node += length; + } while (cur_node != end_node); + } else +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + { + /* Output the literals and matches from the sequences list. */ + const struct deflate_sequence *seq; + + for (seq = sequences; ; seq++) { + u32 litrunlen = seq->litrunlen_and_length & + SEQ_LITRUNLEN_MASK; + unsigned length = seq->litrunlen_and_length >> + SEQ_LENGTH_SHIFT; + unsigned lit; + + /* Output a run of literals. 
*/ + if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { + for (; litrunlen >= 4; litrunlen -= 4) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + } + } + FLUSH_BITS(); + } + } else { + while (litrunlen--) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + } + + if (length == 0) { /* Last sequence? */ + ASSERT(in_next == in_end); + break; + } + + /* Output a match. */ + WRITE_MATCH(codes, length, seq->length_slot, + seq->offset, seq->offset_slot); + in_next += length; + } + } + + /* Output the end-of-block symbol. */ + ASSERT(bitcount <= 7); + ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + FLUSH_BITS(); +out: + ASSERT(bitcount <= 7); + /* + * Assert that the block cost was computed correctly, as + * libdeflate_deflate_compress_bound() relies on this via the assumption + * that uncompressed blocks will always be used when cheaper. + */ + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == + 3 + best_cost || out_next == out_end); + + os->bitbuf = bitbuf; + os->bitcount = bitcount; + os->next = out_next; } /******************************************************************************/ @@ -2196,7 +2168,7 @@ deflate_choose_match(struct libdeflate_compressor *c, seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; seq->offset = offset; seq->length_slot = length_slot; - seq->offset_symbol = offset_slot; + seq->offset_slot = offset_slot; seq++; seq->litrunlen_and_length = 0; @@ -2324,17 +2296,56 @@ choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, * This is the level 0 "compressor". It always outputs uncompressed blocks. */ static size_t -deflate_compress_none(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) +deflate_compress_none(const u8 *in, size_t in_nbytes, + u8 *out, size_t out_nbytes_avail) { - struct deflate_output_bitstream os; + const u8 *in_next = in; + const u8 * const in_end = in + in_nbytes; + u8 *out_next = out; + u8 * const out_end = out + out_nbytes_avail; - deflate_init_output(&os, out, out_nbytes_avail); + /* + * If the input is zero-length, we still must output a block in order + * for the output to be a valid DEFLATE stream. Handle this case + * specially to avoid potentially passing NULL to memcpy() below. 
+ */ + if (unlikely(in_nbytes == 0)) { + if (out_nbytes_avail < 5) + return 0; + /* BFINAL and BTYPE */ + *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + /* LEN and NLEN */ + put_unaligned_le32(0xFFFF0000, out_next); + return 5; + } - deflate_write_uncompressed_blocks(&os, in, in_nbytes, true); + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; - return deflate_flush_output(&os); + if (in_end - in_next <= UINT16_MAX) { + bfinal = 1; + len = in_end - in_next; + } + if (out_end - out_next < 5 + len) + return 0; + /* + * Output BFINAL and BTYPE. The stream is already byte-aligned + * here, so this step always requires outputting exactly 1 byte. + */ + *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + + return out_next - out; } /* @@ -2343,20 +2354,18 @@ deflate_compress_none(struct libdeflate_compressor * restrict c, * splitting algorithm and just uses fixed length blocks. c->max_search_depth * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. */ -static size_t +static void deflate_compress_fastest(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hash = 0; - deflate_init_output(&os, out, out_nbytes_avail); ht_matchfinder_init(&c->p.f.ht_mf); do { @@ -2413,31 +2422,27 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c, } while (in_next < in_max_block_end && seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); - deflate_flush_block(c, &os, in_block_begin, + deflate_flush_block(c, os, in_block_begin, in_next - in_block_begin, c->p.f.sequences, in_next == in_end); } while (in_next != in_end); - - return deflate_flush_output(&os); } /* * This is the "greedy" DEFLATE compressor. It always chooses the longest match. 
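The zero-length path in deflate_compress_none() above always emits the same 5 bytes. A quick standalone check of that encoding (hypothetical test, not part of the diff):

    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
            /* BFINAL=1, BTYPE=00 (stored), LEN=0x0000, NLEN=0xFFFF */
            static const uint8_t expected[5] = { 0x01, 0x00, 0x00, 0xff, 0xff };
            uint8_t buf[5];

            buf[0] = 1 | (0 << 1); /* 1 = BFINAL; DEFLATE_BLOCKTYPE_UNCOMPRESSED = 0 */
            /* put_unaligned_le32(0xFFFF0000, ...) produces these 4 bytes: */
            buf[1] = 0x00; /* LEN, low byte */
            buf[2] = 0x00; /* LEN, high byte */
            buf[3] = 0xff; /* NLEN, low byte */
            buf[4] = 0xff; /* NLEN, high byte */
            return memcmp(buf, expected, sizeof(buf)) != 0;
    }

Every DEFLATE decompressor accepts this as a valid empty final block, which is why the function can return 5 without ever dereferencing 'in'.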
*/ -static size_t +static void deflate_compress_greedy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; - deflate_init_output(&os, out, out_nbytes_avail); hc_matchfinder_init(&c->p.g.hc_mf); do { @@ -2496,29 +2501,24 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c, !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, &os, in_block_begin, + deflate_flush_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); } while (in_next != in_end); - - return deflate_flush_output(&os); } -static forceinline size_t +static forceinline void deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail, - bool lazy2) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os, bool lazy2) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; - deflate_init_output(&os, out, out_nbytes_avail); hc_matchfinder_init(&c->p.g.hc_mf); do { @@ -2707,12 +2707,10 @@ have_cur_match: !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, &os, in_block_begin, + deflate_flush_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); } while (in_next != in_end); - - return deflate_flush_output(&os); } /* @@ -2720,13 +2718,12 @@ have_cur_match: * see if there's a better match at the next position. If yes, it outputs a * literal and continues to the next position. If no, it outputs the match. */ -static size_t +static void deflate_compress_lazy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - return deflate_compress_lazy_generic(c, in, in_nbytes, out, - out_nbytes_avail, false); + deflate_compress_lazy_generic(c, in, in_nbytes, os, false); } /* @@ -2734,13 +2731,12 @@ deflate_compress_lazy(struct libdeflate_compressor * restrict c, * for a better match at the next 2 positions rather than the next 1. This * makes it take slightly more time, but compress some inputs slightly more. 
*/ -static size_t +static void deflate_compress_lazy2(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - return deflate_compress_lazy_generic(c, in, in_nbytes, out, - out_nbytes_avail, true); + deflate_compress_lazy_generic(c, in, in_nbytes, os, true); } #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -3352,15 +3348,14 @@ deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) * - Symbol costs are unknown until the symbols have already been chosen * (so iterative optimization must be used) */ -static size_t +static void deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_block_begin = in_next; const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; const u8 *in_cur_base = in_next; const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); @@ -3369,7 +3364,6 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, struct lz_match *cache_ptr = c->p.n.match_cache; u32 next_hashes[2] = {0, 0}; - deflate_init_output(&os, out, out_nbytes_avail); bt_matchfinder_init(&c->p.n.bt_mf); deflate_near_optimal_init_stats(c); @@ -3567,8 +3561,8 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, deflate_optimize_block(c, in_block_begin, block_length, cache_ptr, is_first, is_final); - deflate_flush_block(c, &os, in_block_begin, - block_length, NULL, is_final); + deflate_flush_block(c, os, in_block_begin, block_length, + NULL, is_final); memmove(c->p.n.match_cache, cache_ptr, cache_len_rewound * sizeof(*cache_ptr)); cache_ptr = &c->p.n.match_cache[cache_len_rewound]; @@ -3592,16 +3586,14 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, deflate_near_optimal_merge_stats(c); deflate_optimize_block(c, in_block_begin, block_length, cache_ptr, is_first, is_final); - deflate_flush_block(c, &os, in_block_begin, - block_length, NULL, is_final); + deflate_flush_block(c, os, in_block_begin, block_length, + NULL, is_final); cache_ptr = &c->p.n.match_cache[0]; deflate_near_optimal_save_stats(c); deflate_near_optimal_init_stats(c); in_block_begin = in_next; } } while (in_next != in_end); - - return deflate_flush_output(&os); } /* Initialize c->p.n.offset_slot_full. */ @@ -3658,11 +3650,12 @@ libdeflate_alloc_compressor(int compression_level) * The higher the compression level, the more we should bother trying to * compress very small inputs. */ - c->min_size_to_compress = 56 - (compression_level * 4); + c->max_passthrough_size = 55 - (compression_level * 4); switch (compression_level) { case 0: - c->impl = deflate_compress_none; + c->max_passthrough_size = SIZE_MAX; + c->impl = NULL; /* not used */ break; case 1: c->impl = deflate_compress_fastest; @@ -3748,20 +3741,42 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) { - if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) + struct deflate_output_bitstream os; + + /* + * For extremely short inputs, or for compression level 0, just output + * uncompressed blocks. 
+	/*
+	 * Initialize the output bitstream structure.
+	 *
+	 * The end is set to OUTPUT_END_PADDING below the true end, so that
+	 * FLUSH_BITS() can be more efficient.
+	 */
+	if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING))
 		return 0;
-
-	/* For extremely small inputs, just use a single uncompressed block. */
-	if (unlikely(in_nbytes < c->min_size_to_compress)) {
-		struct deflate_output_bitstream os;
-		deflate_init_output(&os, out, out_nbytes_avail);
-		if (in_nbytes == 0)
-			in = &os; /* Avoid passing NULL to memcpy(). */
-		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
-		return deflate_flush_output(&os);
-	}
-
-	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
+	os.bitbuf = 0;
+	os.bitcount = 0;
+	os.next = out;
+	os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING;
+	(*c->impl)(c, in, in_nbytes, &os);
+	/*
+	 * If 'os.next' reached 'os.end', then either there was not enough space
+	 * in the output buffer, or the compressed size would have been within
+	 * OUTPUT_END_PADDING of the true end.  For performance reasons we don't
+	 * distinguish between these cases; we just make sure to return some
+	 * extra space from libdeflate_deflate_compress_bound().
+	 */
+	if (os.next >= os.end)
+		return 0;
+	ASSERT(os.bitcount <= 7);
+	if (os.bitcount)
+		*os.next++ = os.bitbuf;
+	return os.next - (u8 *)out;
 }

 LIBDEFLATEEXPORT void LIBDEFLATEAPI
@@ -3780,14 +3795,59 @@ LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 				  size_t in_nbytes)
 {
-	/*
-	 * The worst case is all uncompressed blocks where one block has length
-	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
-	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
-	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
-	 */
-	size_t max_num_blocks =
-		MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+	size_t bound = 0;
+	size_t max_blocks;

-	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
+	/*
+	 * Since the compressor never uses a compressed block when an
+	 * uncompressed block is cheaper, the worst case can be no worse than
+	 * the case where only uncompressed blocks are used.
+	 *
+	 * This is true even though up to 7 bits are "wasted" to byte-align the
+	 * bitstream when a compressed block is followed by an uncompressed
+	 * block.  This is because a compressed block wouldn't have been used if
+	 * it wasn't cheaper than an uncompressed block, and uncompressed blocks
+	 * always end on a byte boundary.  So the alignment bits will, at worst,
+	 * go up to the place where the uncompressed block would have ended.
+	 */
+
+	/*
+	 * The minimum length that is passed to deflate_flush_block() is
+	 * MIN_BLOCK_LENGTH bytes, except for the final block if needed.
+	 *
+	 * If deflate_flush_block() decides to use an uncompressed block, it
+	 * actually will (in general) output a series of uncompressed blocks in
+	 * order to stay within the UINT16_MAX limit of DEFLATE.  But this can
+	 * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
+	 * as in that case this behavior can't result in more blocks than the
+	 * case where deflate_flush_block() is called with min-length inputs.
+	 *
+	 * So the number of uncompressed blocks needed would be bounded by
+	 * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH).  However, empty inputs
+	 * need 1 (empty) block, which gives the final expression below.
+	 */
+	STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
+	max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+
+	/*
+	 * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
+	 * BTYPE, LEN, and NLEN fields.  (For the reason explained earlier, the
+	 * alignment bits at the very start of the block can be disregarded;
+	 * they would otherwise increase the overhead to 6 bytes per block.)
+	 */
+	bound += 5 * max_blocks;
+
+	/* Account for the data itself, stored uncompressed. */
+	bound += in_nbytes;
+
+	/*
+	 * Add 1 + OUTPUT_END_PADDING because for performance reasons, the
+	 * compressor doesn't distinguish between cases where there wasn't
+	 * enough space and cases where the compressed size would have been
+	 * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater.  Adding
+	 * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room.
+	 */
+	bound += 1 + OUTPUT_END_PADDING;
+
+	return bound;
 }
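[Note: callers are expected to size the output buffer with this bound; then a zero return from libdeflate_deflate_compress() can only mean a genuine failure rather than a too-small buffer. A minimal usage sketch of the public API -- the functions shown are the real exported ones; error handling is abbreviated.]

#include <stdio.h>
#include <stdlib.h>
#include <libdeflate.h>

int main(void)
{
	const char msg[] = "example input, example input, example input";
	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
	size_t bound, csize;
	void *cbuf;

	if (c == NULL)
		return 1;
	/* Size the buffer with compress_bound() so a 0 return can only
	 * mean a usage error, never "ran out of space". */
	bound = libdeflate_deflate_compress_bound(c, sizeof(msg));
	cbuf = malloc(bound);
	csize = libdeflate_deflate_compress(c, msg, sizeof(msg), cbuf, bound);
	printf("%zu -> %zu bytes\n", sizeof(msg), csize);
	free(cbuf);
	libdeflate_free_compressor(c);
	return csize == 0;
}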
diff --git a/src/3rdparty/libdeflate/lib/deflate_decompress.c b/src/3rdparty/libdeflate/lib/deflate_decompress.c
index 6138206b3..a37c26042 100644
--- a/src/3rdparty/libdeflate/lib/deflate_decompress.c
+++ b/src/3rdparty/libdeflate/lib/deflate_decompress.c
@@ -313,7 +313,7 @@ do {						\

 /*
  * Read a 16-bit value from the input.  This must have been preceded by a call
- * to ALIGN_INPUT(), and the caller must have already checked for overrun.
+ * to ALIGN_INPUT(), and the caller must have already checked for overread.
  */
 #define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)

@@ -886,9 +886,8 @@ copy_word_unaligned(const void *src, void *dst)
 *****************************************************************************/

 typedef enum libdeflate_result (*decompress_func_t)
-	(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
+	(struct libdeflate_decompressor *d,
+	 const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail,
 	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

 #undef DEFAULT_IMPL
@@ -906,18 +905,16 @@ typedef enum libdeflate_result (*decompress_func_t)

 #ifdef DISPATCH
 static enum libdeflate_result
-dispatch(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
+dispatch(struct libdeflate_decompressor *d,
+	 const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail,
 	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

 static volatile decompress_func_t decompress_impl = dispatch;

 /* Choose the fastest implementation at runtime */
 static enum libdeflate_result
-dispatch(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
+dispatch(struct libdeflate_decompressor *d,
+	 const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail,
 	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
 {
 	decompress_func_t f = arch_select_decompress_func();
@@ -943,9 +940,9 @@ dispatch(struct libdeflate_decompressor * restrict d,
  * at runtime.
  */
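[Note: 'decompress_impl' above is a self-replacing function pointer. It starts out pointing at dispatch(), which picks the best implementation on first call, stores it back into the pointer, and calls it, so later calls skip the selection entirely. A stripped-down sketch of the same pattern in generic C -- not libdeflate's code; the real selector is arch_select_decompress_func() choosing between SIMD variants.]

#include <stdio.h>

static int impl_generic(int x) { return x + 1; }

static int dispatch(int x);

/* The pointer everyone calls through; initially the dispatcher. */
static int (*volatile impl)(int) = dispatch;

static int dispatch(int x)
{
	/* A real selector would query CPU features here. */
	impl = impl_generic;
	return impl(x);
}

int main(void)
{
	printf("%d %d\n", impl(1), impl(2)); /* only the first call dispatches */
	return 0;
}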
 LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
-				 const void * restrict in, size_t in_nbytes,
-				 void * restrict out, size_t out_nbytes_avail,
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d,
+				 const void *in, size_t in_nbytes,
+				 void *out, size_t out_nbytes_avail,
 				 size_t *actual_in_nbytes_ret,
 				 size_t *actual_out_nbytes_ret)
 {
@@ -954,9 +951,9 @@ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
 }

 LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
-			      const void * restrict in, size_t in_nbytes,
-			      void * restrict out, size_t out_nbytes_avail,
+libdeflate_deflate_decompress(struct libdeflate_decompressor *d,
+			      const void *in, size_t in_nbytes,
+			      void *out, size_t out_nbytes_avail,
 			      size_t *actual_out_nbytes_ret)
 {
 	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
diff --git a/src/3rdparty/libdeflate/lib/hc_matchfinder.h b/src/3rdparty/libdeflate/lib/hc_matchfinder.h
index c65691332..bc22961f2 100644
--- a/src/3rdparty/libdeflate/lib/hc_matchfinder.h
+++ b/src/3rdparty/libdeflate/lib/hc_matchfinder.h
@@ -181,15 +181,15 @@ hc_matchfinder_slide_window(struct hc_matchfinder *mf)
  * 'best_len' was found.
  */
 static forceinline u32
-hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
-			     const u8 ** const restrict in_base_p,
-			     const u8 * const restrict in_next,
+hc_matchfinder_longest_match(struct hc_matchfinder * const mf,
+			     const u8 ** const in_base_p,
+			     const u8 * const in_next,
 			     u32 best_len,
 			     const u32 max_len,
 			     const u32 nice_len,
 			     const u32 max_search_depth,
-			     u32 * const restrict next_hashes,
-			     u32 * const restrict offset_ret)
+			     u32 * const next_hashes,
+			     u32 * const offset_ret)
 {
 	u32 depth_remaining = max_search_depth;
 	const u8 *best_matchptr = in_next;
@@ -359,12 +359,12 @@ out:
  * the sequence beginning at @in_next + @count.
  */
 static forceinline void
-hc_matchfinder_skip_bytes(struct hc_matchfinder * const restrict mf,
-			  const u8 ** const restrict in_base_p,
+hc_matchfinder_skip_bytes(struct hc_matchfinder * const mf,
+			  const u8 ** const in_base_p,
 			  const u8 *in_next,
 			  const u8 * const in_end,
 			  const u32 count,
-			  u32 * const restrict next_hashes)
+			  u32 * const next_hashes)
 {
 	u32 cur_pos;
 	u32 hash3, hash4;
diff --git a/src/3rdparty/libdeflate/lib/ht_matchfinder.h b/src/3rdparty/libdeflate/lib/ht_matchfinder.h
index e8323c3c2..a8799c874 100644
--- a/src/3rdparty/libdeflate/lib/ht_matchfinder.h
+++ b/src/3rdparty/libdeflate/lib/ht_matchfinder.h
@@ -75,13 +75,13 @@ ht_matchfinder_slide_window(struct ht_matchfinder *mf)

 /* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */
 static forceinline u32
-ht_matchfinder_longest_match(struct ht_matchfinder * const restrict mf,
-			     const u8 ** const restrict in_base_p,
-			     const u8 * const restrict in_next,
+ht_matchfinder_longest_match(struct ht_matchfinder * const mf,
+			     const u8 ** const in_base_p,
+			     const u8 * const in_next,
 			     const u32 max_len,
 			     const u32 nice_len,
-			     u32 * const restrict next_hash,
-			     u32 * const restrict offset_ret)
+			     u32 * const next_hash,
+			     u32 * const offset_ret)
 {
 	u32 best_len = 0;
 	const u8 *best_matchptr = in_next;
@@ -195,12 +195,12 @@ out:
 }

 static forceinline void
-ht_matchfinder_skip_bytes(struct ht_matchfinder * const restrict mf,
-			  const u8 ** const restrict in_base_p,
+ht_matchfinder_skip_bytes(struct ht_matchfinder * const mf,
+			  const u8 ** const in_base_p,
 			  const u8 *in_next,
 			  const u8 * const in_end,
 			  const u32 count,
-			  u32 * const restrict next_hash)
+			  u32 * const next_hash)
 {
 	s32 cur_pos = in_next - *in_base_p;
 	u32 hash;
diff --git a/src/3rdparty/libdeflate/lib/lib_common.h b/src/3rdparty/libdeflate/lib/lib_common.h
index 2eea56c72..838a8c06f 100644
--- a/src/3rdparty/libdeflate/lib/lib_common.h
+++ b/src/3rdparty/libdeflate/lib/lib_common.h
@@ -60,8 +60,22 @@ void *memmove(void *dest, const void *src, size_t n);
 int memcmp(const void *s1, const void *s2, size_t n);
 #define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n))
+
+#undef LIBDEFLATE_ENABLE_ASSERTIONS
 #else
 #include <string.h>
 #endif

+/*
+ * Runtime assertion support.  Don't enable this in production builds; it may
+ * hurt performance significantly.
+ */
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+void libdeflate_assertion_failed(const char *expr, const char *file, int line);
+#define ASSERT(expr) { if (unlikely(!(expr))) \
+	libdeflate_assertion_failed(#expr, __FILE__, __LINE__); }
+#else
+#define ASSERT(expr) (void)(expr)
+#endif
+
 #endif /* LIB_LIB_COMMON_H */
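[Note: a standalone sketch of what the new assertion machinery does. It mirrors the ASSERT macro and handler above, with unlikely() dropped and names changed so it compiles outside the library; in libdeflate itself, defining LIBDEFLATE_ENABLE_ASSERTIONS at build time (e.g. -DLIBDEFLATE_ENABLE_ASSERTIONS) turns ASSERT() into this check-and-abort, and otherwise it compiles to a no-op evaluation.]

#include <stdio.h>
#include <stdlib.h>

/* Mirrors libdeflate_assertion_failed() from lib/utils.c. */
static void assertion_failed(const char *expr, const char *file, int line)
{
	fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line);
	abort();
}

/* Same shape as the ASSERT macro above, minus the unlikely() hint. */
#define ASSERT(expr) { if (!(expr)) \
	assertion_failed(#expr, __FILE__, __LINE__); }

int main(void)
{
	int bitcount = 7;

	ASSERT(bitcount <= 7); /* passes silently */
	ASSERT(bitcount < 7);  /* prints the message and aborts */
	return 0;
}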
diff --git a/src/3rdparty/libdeflate/lib/utils.c b/src/3rdparty/libdeflate/lib/utils.c
index c626af1f2..46826ef48 100644
--- a/src/3rdparty/libdeflate/lib/utils.c
+++ b/src/3rdparty/libdeflate/lib/utils.c
@@ -140,3 +140,14 @@ memcmp(const void *s1, const void *s2, size_t n)
 	return 0;
 }
 #endif /* FREESTANDING */
+
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+#include <stdio.h>
+#include <stdlib.h>
+void
+libdeflate_assertion_failed(const char *expr, const char *file, int line)
+{
+	fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line);
+	abort();
+}
+#endif /* LIBDEFLATE_ENABLE_ASSERTIONS */
diff --git a/src/3rdparty/libdeflate/libdeflate.h b/src/3rdparty/libdeflate/libdeflate.h
index 4c72ef959..c51dcf998 100644
--- a/src/3rdparty/libdeflate/libdeflate.h
+++ b/src/3rdparty/libdeflate/libdeflate.h
@@ -10,8 +10,8 @@ extern "C" {
 #endif

 #define LIBDEFLATE_VERSION_MAJOR	1
-#define LIBDEFLATE_VERSION_MINOR	9
-#define LIBDEFLATE_VERSION_STRING	"1.9"
+#define LIBDEFLATE_VERSION_MINOR	11
+#define LIBDEFLATE_VERSION_STRING	"1.11"

 #include <stddef.h>
 #include <stdint.h>
@@ -31,7 +31,7 @@ extern "C" {
 #  define LIBDEFLATEEXPORT
 #endif

-#if defined(_WIN32) && !defined(_WIN64)
+#if defined(_WIN32) && !defined(_WIN64) && defined(LIBDEFLATE_DLL)
 #  define LIBDEFLATEAPI_ABI	__stdcall
 #else
 #  define LIBDEFLATEAPI_ABI