drop BLAKE3 in favour of custom hash algorithm

faster than BLAKE3 and the other algorithms, with collision risk reduced
based on the length of the input data. Benchmark result:
********* Start testing of tst_qcryptographichash *********
Config: Using QTest library 4.12.0, Katie 4.12.0
PASS  : tst_qcryptographichash::initTestCase()
RESULT   : tst_qcryptographichash::append():"10 (Md5)":
     0.00280 msecs per iteration (total: 561, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"10 (Sha1)":
     0.00333 msecs per iteration (total: 667, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"10 (Sha256)":
     0.00467 msecs per iteration (total: 934, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"10 (Sha512)":
     0.00361 msecs per iteration (total: 723, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"10 (KAT)":
     0.00219 msecs per iteration (total: 439, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"100 (Md5)":
     0.000620 msecs per iteration (total: 124, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"100 (Sha1)":
     0.00109 msecs per iteration (total: 219, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"100 (Sha256)":
     0.000900 msecs per iteration (total: 180, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"100 (Sha512)":
     0.00106 msecs per iteration (total: 212, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"100 (KAT)":
     0.000740 msecs per iteration (total: 148, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"250 (Md5)":
     0.000580 msecs per iteration (total: 116, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"250 (Sha1)":
     0.00134 msecs per iteration (total: 268, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"250 (Sha256)":
     0.000845 msecs per iteration (total: 169, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"250 (Sha512)":
     0.00100 msecs per iteration (total: 200, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"250 (KAT)":
     0.000625 msecs per iteration (total: 125, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"500 (Md5)":
     0.000550 msecs per iteration (total: 110, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"500 (Sha1)":
     0.00137 msecs per iteration (total: 274, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"500 (Sha256)":
     0.000830 msecs per iteration (total: 166, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"500 (Sha512)":
     0.000985 msecs per iteration (total: 197, iterations: 200000)
RESULT   : tst_qcryptographichash::append():"500 (KAT)":
     0.000575 msecs per iteration (total: 115, iterations: 200000)
PASS  : tst_qcryptographichash::append()
RESULT   : tst_qcryptographichash::append_once():"Md5":
     0.00155 msecs per iteration (total: 310, iterations: 200000)
RESULT   : tst_qcryptographichash::append_once():"Sha1":
     0.00212 msecs per iteration (total: 424, iterations: 200000)
RESULT   : tst_qcryptographichash::append_once():"Sha256":
     0.00414 msecs per iteration (total: 828, iterations: 200000)
RESULT   : tst_qcryptographichash::append_once():"Sha512":
     0.00314 msecs per iteration (total: 629, iterations: 200000)
RESULT   : tst_qcryptographichash::append_once():"KAT":
     0.000805 msecs per iteration (total: 161, iterations: 200000)
PASS  : tst_qcryptographichash::append_once()
RESULT   : tst_qcryptographichash::statichash():"Md5":
     0.00149 msecs per iteration (total: 298, iterations: 200000)
RESULT   : tst_qcryptographichash::statichash():"Sha1":
     0.00206 msecs per iteration (total: 413, iterations: 200000)
RESULT   : tst_qcryptographichash::statichash():"Sha256":
     0.00408 msecs per iteration (total: 817, iterations: 200000)
RESULT   : tst_qcryptographichash::statichash():"Sha512":
     0.00309 msecs per iteration (total: 618, iterations: 200000)
RESULT   : tst_qcryptographichash::statichash():"KAT":
     0.000610 msecs per iteration (total: 122, iterations: 200000)
PASS  : tst_qcryptographichash::statichash()
PASS  : tst_qcryptographichash::cleanupTestCase()
Totals: 5 passed, 0 failed, 0 skipped
********* Finished testing of tst_qcryptographichash *********
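
For illustration, a minimal sketch (not part of this commit) of exercising the new algorithm through QCryptographicHash; the enum value name KAT is an assumption based on the benchmark data tags above:

```cpp
#include <QCryptographicHash>
#include <QByteArray>
#include <QDebug>

int main()
{
    const QByteArray data("hello world");

    // One-shot hashing, as in the statichash() benchmark.
    // QCryptographicHash::KAT is assumed from the "(KAT)" data tags.
    qDebug() << QCryptographicHash::hash(data, QCryptographicHash::KAT).toHex();

    // Incremental hashing, as in the append() benchmark.
    QCryptographicHash hasher(QCryptographicHash::KAT);
    hasher.addData(data);
    qDebug() << hasher.result().toHex();
    return 0;
}
```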

Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
Author: Ivailo Monev
Date: 2022-03-13 22:59:25 +02:00
commit 3227047514
parent c228fd6100
19 changed files with 6440 additions and 1981 deletions

README

@@ -50,6 +50,7 @@ There are several things you should be aware before considering Katie:
- some additions have been made:
- custom image format
- custom hash algorithm
- JSON document handler
- standard directories handler
- building with LTO is possible and supported, if the toolchain can handle it
@@ -59,7 +60,6 @@ There are several things you should be aware before considering Katie:
- support for AArch64 architecture
- support for locale aliases
- support for generating SHA-256 and SHA-512 hash sums (SHA-2)
- support for generating BLAKE3 hash sums
- verification section for plugins build with Clang
- qCompress() and qUncompress() use libdeflate which is much faster
- stack backtrace on assert, crash or warning via execinfo


@@ -1,330 +0,0 @@
This work is released into the public domain with CC0 1.0. Alternatively, it is
licensed under the Apache License 2.0.
-------------------------------------------------------------------------------
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the presence or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.
-------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Jack O'Connor and Samuel Neves
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -1,2 +0,0 @@
This is Git checkout 9cd41c0cfdc2fa2fcb8261ff3e1446d98495c991
from https://github.com/BLAKE3-team/BLAKE3 that has been modified.


@@ -1,203 +0,0 @@
# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a>
BLAKE3 is a cryptographic hash function that is:
- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2.
- **Secure**, unlike MD5 and SHA-1. And secure against length extension,
unlike SHA-2.
- **Highly parallelizable** across any number of threads and SIMD lanes,
because it's a Merkle tree on the inside.
- Capable of **verified streaming** and **incremental updates**, again
because it's a Merkle tree.
- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash.
- **One algorithm with no variants**, which is fast on x86-64 and also
on smaller architectures.
The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py)
is an example benchmark of 16 KiB inputs on modern server hardware (a Cascade
Lake-SP 8275CL processor). For more detailed benchmarks, see the
[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
<p align="center">
<img src="media/speed.svg" alt="performance graph">
</p>
BLAKE3 is based on an optimized instance of the established hash
function [BLAKE2](https://blake2.net) and on the [original Bao tree
mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md).
The specifications and design rationale are available in the [BLAKE3
paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
The default output size is 256 bits. The current version of
[Bao](https://github.com/oconnor663/bao) implements verified streaming
with BLAKE3.
This repository is the official implementation of BLAKE3. It includes:
* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512,
and NEON, with automatic runtime CPU feature detection on x86. The
`rayon` feature provides multithreading.
* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
provides a command line interface. It uses multithreading by default,
making it an order of magnitude faster than e.g. `sha256sum` on
typical desktop hardware.
* The [C implementation](c), which like the Rust implementation includes
SIMD code and runtime CPU feature detection on x86. Unlike the Rust
implementation, it's not currently multithreaded. See
[`c/README.md`](c/README.md).
* The [Rust reference implementation](reference_impl/reference_impl.rs),
which is discussed in Section 5.1 of the [BLAKE3
paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
This implementation is much smaller and simpler than the optimized
ones above. If you want to see how BLAKE3 works, or you're writing a
port that doesn't need multithreading or SIMD optimizations, start
here. Ports of the reference implementation to other languages are
hosted in separate repositories
([C](https://github.com/oconnor663/blake3_reference_impl_c),
[Python](https://github.com/oconnor663/pure_python_blake3)).
* A [set of test
vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json)
that covers extended outputs, all three modes, and a variety of input
lengths.
* [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions)
BLAKE3 was designed by:
* [@oconnor663 ](https://github.com/oconnor663) (Jack O'Connor)
* [@sneves](https://github.com/sneves) (Samuel Neves)
* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson)
* [@zookozcash](https://github.com/zookozcash) (Zooko)
The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co).
*NOTE: BLAKE3 is not a password hashing algorithm, because it's
designed to be fast, whereas password hashing should not be fast. If you
hash passwords to store the hashes or if you derive keys from passwords,
we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
## Usage
### The `b3sum` utility
The `b3sum` command line utility prints the BLAKE3 hashes of files or of
standard input. Prebuilt binaries are available for Linux, Windows, and
macOS (requiring the [unidentified developer
workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac))
on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases).
If you've [installed Rust and
Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html),
you can also build `b3sum` yourself with:
```bash
cargo install b3sum
```
If `rustup` didn't configure your `PATH` for you, you might need to go
looking for the installed binary in e.g. `~/.cargo/bin`. You can test
out how fast BLAKE3 is on your machine by creating a big file and
hashing it, for example:
```bash
# Create a 1 GB file.
head -c 1000000000 /dev/zero > /tmp/bigfile
# Hash it with SHA-256.
time openssl sha256 /tmp/bigfile
# Hash it with BLAKE3.
time b3sum /tmp/bigfile
```
### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3)
To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to
your `Cargo.toml`. Here's an example of hashing some input bytes:
```rust
// Hash an input all at once.
let hash1 = blake3::hash(b"foobarbaz");
// Hash an input incrementally.
let mut hasher = blake3::Hasher::new();
hasher.update(b"foo");
hasher.update(b"bar");
hasher.update(b"baz");
let hash2 = hasher.finalize();
assert_eq!(hash1, hash2);
// Extended output. OutputReader also implements Read and Seek.
let mut output = [0; 1000];
let mut output_reader = hasher.finalize_xof();
output_reader.fill(&mut output);
assert_eq!(&output[..32], hash1.as_bytes());
// Print a hash as hex.
println!("{}", hash1);
```
Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and
`derive_key`. The `keyed_hash` mode takes a 256-bit key:
```rust
// MAC an input all at once.
let example_key = [42u8; 32];
let mac1 = blake3::keyed_hash(&example_key, b"example input");
// MAC incrementally.
let mut hasher = blake3::Hasher::new_keyed(&example_key);
hasher.update(b"example input");
let mac2 = hasher.finalize();
assert_eq!(mac1, mac2);
```
The `derive_key` mode takes a context string and some key material (not a
password). The context string should be hardcoded, globally unique, and
application-specific. A good default format for the context string is
`"[application] [commit timestamp] [purpose]"`:
```rust
// Derive a couple of subkeys for different purposes.
const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key";
const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key";
let input_key_material = b"usually at least 32 random bytes, not a password";
let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material);
let api_key = blake3::derive_key(API_CONTEXT, input_key_material);
assert_ne!(email_key, api_key);
```
### The C implementation
See [`c/README.md`](c/README.md).
### Other implementations
We post links to third-party bindings and implementations on the
[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever
we hear about them. Some highlights include [an optimized Go
implementation](https://github.com/zeebo/blake3), [Wasm bindings for
Node.js and browsers](https://github.com/connor4312/blake3), [binary
wheels for Python](https://github.com/oconnor663/blake3-py), [.NET
bindings](https://github.com/xoofx/Blake3.NET), and [JNI
bindings](https://github.com/sken77/BLAKE3jni).
## Contributing
Please see [CONTRIBUTING.md](CONTRIBUTING.md).
## Intellectual property
The Rust code is copyright Jack O'Connor, 2019-2020. The C code is
copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code
is copyright Samuel Neves, 2019-2020.
This work is released into the public domain with CC0 1.0.
Alternatively, it is licensed under the Apache License 2.0.
## Miscellany
- [@veorq](https://github.com/veorq) and
[@oconnor663](https://github.com/oconnor663) did [a podcast
interview](https://www.cryptography.fm/3) about designing BLAKE3.


@@ -1,616 +0,0 @@
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
self->blocks_compressed = 0;
self->flags = flags;
}
INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
uint64_t chunk_counter) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = chunk_counter;
self->blocks_compressed = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
}
INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
((size_t)self->buf_len);
}
INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
const uint8_t *input, size_t input_len) {
size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
if (take > input_len) {
take = input_len;
}
uint8_t *dest = self->buf + ((size_t)self->buf_len);
memcpy(dest, input, take);
self->buf_len += (uint8_t)take;
return take;
}
INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
if (self->blocks_compressed == 0) {
return CHUNK_START;
} else {
return 0;
}
}
typedef struct {
uint32_t input_cv[8];
uint64_t counter;
uint8_t block[BLAKE3_BLOCK_LEN];
uint8_t block_len;
uint8_t flags;
} output_t;
INLINE output_t make_output(const uint32_t input_cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
output_t ret;
memcpy(ret.input_cv, input_cv, 32);
memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
ret.block_len = block_len;
ret.counter = counter;
ret.flags = flags;
return ret;
}
// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
uint32_t cv_words[8];
memcpy(cv_words, self->input_cv, 32);
blake3_compress_in_place(cv_words, self->block, self->block_len,
self->counter, self->flags);
store_cv_words(cv, cv_words);
}
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
while (out_len > 0) {
blake3_compress_xof(self->input_cv, self->block, self->block_len,
output_block_counter, self->flags | ROOT, wide_buf);
size_t available_bytes = 64 - offset_within_block;
size_t memcpy_len;
if (out_len > available_bytes) {
memcpy_len = available_bytes;
} else {
memcpy_len = out_len;
}
memcpy(out, wide_buf + offset_within_block, memcpy_len);
out += memcpy_len;
out_len -= memcpy_len;
output_block_counter += 1;
offset_within_block = 0;
}
}
INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
size_t input_len) {
if (self->buf_len > 0) {
size_t take = chunk_state_fill_buf(self, input, input_len);
input += take;
input_len -= take;
if (input_len > 0) {
blake3_compress_in_place(
self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
self->buf_len = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
}
}
while (input_len > BLAKE3_BLOCK_LEN) {
blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
input += BLAKE3_BLOCK_LEN;
input_len -= BLAKE3_BLOCK_LEN;
}
size_t take = chunk_state_fill_buf(self, input, input_len);
input += take;
input_len -= take;
}
INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
uint8_t block_flags =
self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
block_flags);
}
INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
const uint32_t key[8], uint8_t flags) {
return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
}
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
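// Worked example: content_len = 4097 (four full chunks plus one byte)
// gives full_chunks = 4, which is already a power of 2, so the left
// subtree gets 4 * 1024 = 4096 bytes and the right subtree gets 1 byte.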
INLINE size_t left_len(size_t content_len) {
// Subtract 1 to reserve at least one byte for the right side. content_len
// should always be greater than BLAKE3_CHUNK_LEN.
size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(0 < input_len);
assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
#endif
const uint8_t *chunks_array[MAX_SIMD_DEGREE];
size_t input_position = 0;
size_t chunks_array_len = 0;
while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
chunks_array[chunks_array_len] = &input[input_position];
input_position += BLAKE3_CHUNK_LEN;
chunks_array_len += 1;
}
blake3_hash_many(chunks_array, chunks_array_len,
BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
true, flags, CHUNK_START, CHUNK_END, out);
// Hash the remaining partial chunk, if there is one. Note that the empty
// chunk (meaning the empty message) is a different codepath.
if (input_len > input_position) {
uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, key, flags);
chunk_state.chunk_counter = counter;
chunk_state_update(&chunk_state, &input[input_position],
input_len - input_position);
output_t output = chunk_state_output(&chunk_state);
output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
return chunks_array_len + 1;
} else {
return chunks_array_len;
}
}
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
size_t num_chaining_values,
const uint32_t key[8], uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(2 <= num_chaining_values);
assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
#endif
const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
size_t parents_array_len = 0;
while (num_chaining_values - (2 * parents_array_len) >= 2) {
parents_array[parents_array_len] =
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
parents_array_len += 1;
}
blake3_hash_many(parents_array, parents_array_len, 1, key,
0, // Parents always use counter 0.
false, flags | PARENT,
0, // Parents have no start flags.
0, // Parents have no end flags.
out);
// If there's an odd child left over, it becomes an output.
if (num_chaining_values > 2 * parents_array_len) {
memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
BLAKE3_OUT_LEN);
return parents_array_len + 1;
} else {
return parents_array_len;
}
}
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
static size_t blake3_compress_subtree_wide(const uint8_t *input,
size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter,
uint8_t flags, uint8_t *out) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
// can help performance on smaller platforms.
if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
out);
}
// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
size_t left_input_len = left_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
// Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
// account for the special case of returning 2 outputs when the SIMD degree
// is 1.
uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t degree = blake3_simd_degree();
if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
// The special case: We always use a degree of at least two, to make
// sure there are two outputs. Except, as noted above, at the chunk
// level, where we allow degree=1. (Note that the 1-chunk-input case is
// a different codepath.)
degree = 2;
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
// Recurse! If this implementation adds multi-threading support in the
// future, this is where it will go.
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
chunk_counter, flags, cv_array);
size_t right_n = blake3_compress_subtree_wide(
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
// them directly, to make sure we always have at least two outputs.
if (left_n == 1) {
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
return 2;
}
// Otherwise, do one layer of parent node compression.
size_t num_chaining_values = left_n + right_n;
return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
out);
}
// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void compress_subtree_to_parent_node(
const uint8_t *input, size_t input_len, const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->key, key, BLAKE3_KEY_LEN);
chunk_state_init(&self->chunk, key, flags);
self->cv_stack_len = 0;
}
void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]) {
uint32_t key_words[8];
load_key_words(key, key_words);
hasher_init_base(self, key_words, KEYED_HASH);
}
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len) {
blake3_hasher context_hasher;
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
blake3_hasher_update(&context_hasher, context, context_len);
uint8_t context_key[BLAKE3_KEY_LEN];
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
uint32_t context_key_words[8];
load_key_words(context_key, context_key_words);
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
}
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
}
// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
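// Worked example: after the CVs of 8 chunks have been pushed, the stack
// briefly holds 4 entries (popcnt(7) == 3 retained CVs plus the unmerged
// 8th). When the next CV is about to be pushed, this merge loop runs with
// total_len = 8, and popcnt(8) == 1 collapses the stack to a single CV.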
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
size_t post_merge_stack_len = (size_t)popcnt(total_len);
while (self->cv_stack_len > post_merge_stack_len) {
uint8_t *parent_node =
&self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
output_t output = parent_output(parent_node, self->key, self->chunk.flags);
output_chaining_value(&output, parent_node);
self->cv_stack_len -= 1;
}
}
// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the chunk_state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
// In this case, the root node of the 64 KiB subtree would be the root node
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the root of the whole tree. For example, maybe
// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
uint64_t chunk_counter) {
hasher_merge_cv_stack(self, chunk_counter);
memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
BLAKE3_OUT_LEN);
self->cv_stack_len += 1;
}
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_update(&hasher, v.data(), v.size());
if (input_len == 0) {
return;
}
const uint8_t *input_bytes = (const uint8_t *)input;
// If we have some partial chunk bytes in the internal chunk_state, we need
// to finish that chunk first.
if (chunk_state_len(&self->chunk) > 0) {
size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
if (take > input_len) {
take = input_len;
}
chunk_state_update(&self->chunk, input_bytes, take);
input_bytes += take;
input_len -= take;
// If we've filled the current chunk and there's more coming, finalize this
// chunk and proceed. In this case we know it's not the root.
if (input_len > 0) {
output_t output = chunk_state_output(&self->chunk);
uint8_t chunk_cv[32];
output_chaining_value(&output, chunk_cv);
hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
} else {
return;
}
}
// Now the chunk_state is clear, and we have more input. If there's more than
// a single chunk (so, definitely not the root chunk), hash the largest whole
// subtree we can, with the full benefits of SIMD (and maybe in the future,
// multi-threading) parallelism. Two restrictions:
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
// the right edge can be incomplete, and we don't know where the right edge
// is going to be until we get to finalize().
// - The subtree must evenly divide the total number of chunks up until this
// point (if total is not 0). If the current incomplete subtree is only
// waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
// to complete the current subtree first.
// Because we might need to break up the input to form powers of 2, or to
// evenly divide what we already have, this part runs in a loop.
while (input_len > BLAKE3_CHUNK_LEN) {
size_t subtree_len = round_down_to_power_of_2(input_len);
uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
// Shrink the subtree_len until it evenly divides the count so far. We know
// that subtree_len itself is a power of 2, so we can use a bitmasking
// trick instead of an actual remainder operation. (Note that if the caller
// consistently passes power-of-2 inputs of the same size, as is hopefully
// typical, this loop condition will always fail, and subtree_len will
// always be the full length of the input.)
//
// An aside: We don't have to shrink subtree_len quite this much. For
// example, if count_so_far is 1, we could pass 2 chunks to
// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
// get the right answer in the end, and we might get to use 2-way SIMD
// parallelism. The problem with this optimization, is that it gets us
// stuck always hashing 2 chunks. The total number of chunks will remain
// odd, and we'll never graduate to higher degrees of parallelism. See
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
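// Worked example: with 3 chunks (count_so_far = 3072 bytes) hashed and a
// candidate subtree_len of 4096, (4095 & 3072) != 0 and (2047 & 3072) != 0,
// so subtree_len shrinks to 1024: only a single chunk fits before the
// 4-chunk boundary is reached.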
while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
subtree_len /= 2;
}
// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
if (subtree_len <= BLAKE3_CHUNK_LEN) {
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, self->key, self->chunk.flags);
chunk_state.chunk_counter = self->chunk.chunk_counter;
chunk_state_update(&chunk_state, input_bytes, subtree_len);
output_t output = chunk_state_output(&chunk_state);
uint8_t cv[BLAKE3_OUT_LEN];
output_chaining_value(&output, cv);
hasher_push_cv(self, cv, chunk_state.chunk_counter);
} else {
// This is the high-performance happy path, though getting here depends
// on the caller giving us a long enough input.
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
self->chunk.flags, cv_pair);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
}
self->chunk.chunk_counter += subtree_chunks;
input_bytes += subtree_len;
input_len -= subtree_len;
}
// If there's any remaining input less than a full chunk, add it to the chunk
// state. In that case, also do a final merge loop to make sure the subtree
// stack doesn't contain any unmerged pairs. The remaining input means we
// know these merges are non-root. This merge loop isn't strictly necessary
// here, because hasher_push_cv already does its own merge loop, but it
// simplifies blake3_hasher_finalize below.
if (input_len > 0) {
chunk_state_update(&self->chunk, input_bytes, input_len);
hasher_merge_cv_stack(self, self->chunk.chunk_counter);
}
}
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
blake3_hasher_finalize_seek(self, 0, out, out_len);
}
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_finalize(&hasher, v.data(), v.size());
if (out_len == 0) {
return;
}
// If the subtree stack is empty, then the current chunk is the root.
if (self->cv_stack_len == 0) {
output_t output = chunk_state_output(&self->chunk);
output_root_bytes(&output, seek, out, out_len);
return;
}
// If there are any bytes in the chunk state, finalize that chunk and do a
// roll-up merge between that chunk hash and every subtree in the stack. In
// this case, the extra merge loop at the end of blake3_hasher_update
// guarantees that none of the subtrees in the stack need to be merged with
// each other first. Otherwise, if there are no bytes in the chunk state,
// then the top of the stack is a chunk hash, and we start the merge from
// that.
output_t output;
size_t cvs_remaining;
if (chunk_state_len(&self->chunk) > 0) {
cvs_remaining = self->cv_stack_len;
output = chunk_state_output(&self->chunk);
} else {
// There are always at least 2 CVs in the stack in this case.
cvs_remaining = self->cv_stack_len - 2;
output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
self->chunk.flags);
}
while (cvs_remaining > 0) {
cvs_remaining -= 1;
uint8_t parent_block[BLAKE3_BLOCK_LEN];
memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
output_chaining_value(&output, &parent_block[32]);
output = parent_output(parent_block, self->key, self->chunk.flags);
}
output_root_bytes(&output, seek, out, out_len);
}
void blake3_hasher_reset(blake3_hasher *self) {
chunk_state_reset(&self->chunk, self->key, 0);
self->cv_stack_len = 0;
}


@@ -1,65 +0,0 @@
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#define BLAKE3_NO_SSE2
#define BLAKE3_NO_SSE41
#define BLAKE3_NO_AVX2
#define BLAKE3_NO_AVX512
#ifdef __cplusplus
extern "C" {
#endif
#define BLAKE3_VERSION_STRING "1.3.1"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
#define BLAKE3_CHUNK_LEN 1024
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
typedef struct {
uint32_t cv[8];
uint64_t chunk_counter;
uint8_t buf[BLAKE3_BLOCK_LEN];
uint8_t buf_len;
uint8_t blocks_compressed;
uint8_t flags;
} blake3_chunk_state;
typedef struct {
uint32_t key[8];
blake3_chunk_state chunk;
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
const char *blake3_version(void);
void blake3_hasher_init(blake3_hasher *self);
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */
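
For context, the header above declares the incremental BLAKE3 C API that this commit drops. A minimal sketch of how it was used (illustrative only, not part of the patch):

```C
#include <stdio.h>
#include "blake3.h"

int main(void) {
    /* Incremental hashing: init, feed data in arbitrary pieces, finalize. */
    blake3_hasher hasher;
    blake3_hasher_init(&hasher);
    blake3_hasher_update(&hasher, "abc", 3);

    uint8_t out[BLAKE3_OUT_LEN];
    blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);

    for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
        printf("%02x", out[i]);
    }
    printf("\n");
    return 0;
}
```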

src/3rdparty/BLAKE3/blake3_dispatch.c vendored

@@ -1,276 +0,0 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#error "Unimplemented!"
#endif
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
static uint64_t xgetbv() {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
uint32_t eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64_t)edx << 32) | eax;
#endif
}
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
__cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
__cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features() {
if (g_cpu_features != UNDEFINED) {
return g_cpu_features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
enum cpu_feature features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
features |= SSE2;
#else
if (*edx & (1UL << 26))
features |= SSE2;
#endif
if (*ecx & (1UL << 0))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
if (*ecx & (1UL << 27)) { // OSXSAVE
const uint64_t mask = xgetbv();
if ((mask & 6) == 6) { // SSE and AVX states
if (*ecx & (1UL << 28))
features |= AVX;
if (max_id >= 7) {
cpuidex(regs, 7, 0);
if (*ebx & (1UL << 5))
features |= AVX2;
if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
if (*ebx & (1UL << 31))
features |= AVX512VL;
if (*ebx & (1UL << 16))
features |= AVX512F;
}
}
}
}
g_cpu_features = features;
return features;
#else
/* How to detect NEON? */
return 0;
#endif
}
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}
// The dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
return 8;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
return 4;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
return 4;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
return 4;
#endif
return 1;
}

src/3rdparty/BLAKE3/blake3_impl.h vendored

@@ -1,282 +0,0 @@
#ifndef BLAKE3_IMPL_H
#define BLAKE3_IMPL_H
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "blake3.h"
// internal flags
enum blake3_flags {
CHUNK_START = 1 << 0,
CHUNK_END = 1 << 1,
PARENT = 1 << 2,
ROOT = 1 << 3,
KEYED_HASH = 1 << 4,
DERIVE_KEY_CONTEXT = 1 << 5,
DERIVE_KEY_MATERIAL = 1 << 6,
};
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif
#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
#if defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
0x1F83D9ABUL, 0x5BE0CD19UL};
static const uint8_t MSG_SCHEDULE[7][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
{3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
{10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
{12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
{9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
{11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
/* Find index of the highest set bit */
/* x is assumed to be nonzero. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return 63 ^ __builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
unsigned long index;
_BitScanReverse64(&index, x);
return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
if(x >> 32) {
unsigned long index;
_BitScanReverse(&index, (unsigned long)(x >> 32));
return 32 + index;
} else {
unsigned long index;
_BitScanReverse(&index, (unsigned long)x);
return index;
}
#else
unsigned int c = 0;
if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
if(x & 0x000000000000000cULL) { x >>= 2; c += 2; }
if(x & 0x0000000000000002ULL) { c += 1; }
return c;
#endif
}
// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return __builtin_popcountll(x);
#else
unsigned int count = 0;
while (x != 0) {
count += 1;
x &= x - 1;
}
return count;
#endif
}
// Largest power of two less than or equal to x. As a special case, returns 1
// when x is 0.
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
return 1ULL << highest_one(x | 1);
}
INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
INLINE uint32_t counter_high(uint64_t counter) {
return (uint32_t)(counter >> 32);
}
INLINE uint32_t load32(const void *src) {
const uint8_t *p = (const uint8_t *)src;
return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
uint32_t key_words[8]) {
key_words[0] = load32(&key[0 * 4]);
key_words[1] = load32(&key[1 * 4]);
key_words[2] = load32(&key[2 * 4]);
key_words[3] = load32(&key[3 * 4]);
key_words[4] = load32(&key[4 * 4]);
key_words[5] = load32(&key[5 * 4]);
key_words[6] = load32(&key[6 * 4]);
key_words[7] = load32(&key[7 * 4]);
}
INLINE void store32(void *dst, uint32_t w) {
uint8_t *p = (uint8_t *)dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
}
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
store32(&bytes_out[0 * 4], cv_words[0]);
store32(&bytes_out[1 * 4], cv_words[1]);
store32(&bytes_out[2 * 4], cv_words[2]);
store32(&bytes_out[3 * 4], cv_words[3]);
store32(&bytes_out[4 * 4], cv_words[4]);
store32(&bytes_out[5 * 4], cv_words[5]);
store32(&bytes_out[6 * 4], cv_words[6]);
store32(&bytes_out[7 * 4], cv_words[7]);
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]);
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
size_t blake3_simd_degree(void);
// Declarations for implementation-specific functions.
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#endif
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#endif /* BLAKE3_IMPL_H */

src/3rdparty/BLAKE3/blake3_portable.c vendored

@@ -1,160 +0,0 @@
#include "blake3_impl.h"
#include <string.h>
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
return (w >> c) | (w << (32 - c));
}
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
uint32_t x, uint32_t y) {
state[a] = state[a] + state[b] + x;
state[d] = rotr32(state[d] ^ state[a], 16);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 12);
state[a] = state[a] + state[b] + y;
state[d] = rotr32(state[d] ^ state[a], 8);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 7);
}
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
// Select the message schedule based on the round.
const uint8_t *schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the rows.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
uint32_t block_words[16];
block_words[0] = load32(block + 4 * 0);
block_words[1] = load32(block + 4 * 1);
block_words[2] = load32(block + 4 * 2);
block_words[3] = load32(block + 4 * 3);
block_words[4] = load32(block + 4 * 4);
block_words[5] = load32(block + 4 * 5);
block_words[6] = load32(block + 4 * 6);
block_words[7] = load32(block + 4 * 7);
block_words[8] = load32(block + 4 * 8);
block_words[9] = load32(block + 4 * 9);
block_words[10] = load32(block + 4 * 10);
block_words[11] = load32(block + 4 * 11);
block_words[12] = load32(block + 4 * 12);
block_words[13] = load32(block + 4 * 13);
block_words[14] = load32(block + 4 * 14);
block_words[15] = load32(block + 4 * 15);
state[0] = cv[0];
state[1] = cv[1];
state[2] = cv[2];
state[3] = cv[3];
state[4] = cv[4];
state[5] = cv[5];
state[6] = cv[6];
state[7] = cv[7];
state[8] = IV[0];
state[9] = IV[1];
state[10] = IV[2];
state[11] = IV[3];
state[12] = counter_low(counter);
state[13] = counter_high(counter);
state[14] = (uint32_t)block_len;
state[15] = (uint32_t)flags;
round_fn(state, &block_words[0], 0);
round_fn(state, &block_words[0], 1);
round_fn(state, &block_words[0], 2);
round_fn(state, &block_words[0], 3);
round_fn(state, &block_words[0], 4);
round_fn(state, &block_words[0], 5);
round_fn(state, &block_words[0], 6);
}
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
cv[0] = state[0] ^ state[8];
cv[1] = state[1] ^ state[9];
cv[2] = state[2] ^ state[10];
cv[3] = state[3] ^ state[11];
cv[4] = state[4] ^ state[12];
cv[5] = state[5] ^ state[13];
cv[6] = state[6] ^ state[14];
cv[7] = state[7] ^ state[15];
}
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
store32(&out[0 * 4], state[0] ^ state[8]);
store32(&out[1 * 4], state[1] ^ state[9]);
store32(&out[2 * 4], state[2] ^ state[10]);
store32(&out[3 * 4], state[3] ^ state[11]);
store32(&out[4 * 4], state[4] ^ state[12]);
store32(&out[5 * 4], state[5] ^ state[13]);
store32(&out[6 * 4], state[6] ^ state[14]);
store32(&out[7 * 4], state[7] ^ state[15]);
store32(&out[8 * 4], state[8] ^ cv[0]);
store32(&out[9 * 4], state[9] ^ cv[1]);
store32(&out[10 * 4], state[10] ^ cv[2]);
store32(&out[11 * 4], state[11] ^ cv[3]);
store32(&out[12 * 4], state[12] ^ cv[4]);
store32(&out[13 * 4], state[13] ^ cv[5]);
store32(&out[14 * 4], state[14] ^ cv[6]);
store32(&out[15 * 4], state[15] ^ cv[7]);
}
INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
store_cv_words(out, cv);
}
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs > 0) {
hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}

src/3rdparty/xxHash/LICENSE vendored Normal file

@@ -0,0 +1,26 @@
xxHash Library
Copyright (c) 2012-2021 Yann Collet
All rights reserved.
BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

src/3rdparty/xxHash/NOTE vendored Normal file

@@ -0,0 +1,2 @@
This is Git checkout aa7ac8bd1f31172937069efe7557600593ffe871
from https://github.com/Cyan4973/xxHash that has not been modified.

src/3rdparty/xxHash/README.md vendored Normal file

@@ -0,0 +1,263 @@
xxHash - Extremely fast hash algorithm
======================================
xxHash is an Extremely fast Hash algorithm, running at RAM speed limits.
It successfully completes the [SMHasher](https://code.google.com/p/smhasher/wiki/SMHasher) test suite
which evaluates collision, dispersion and randomness qualities of hash functions.
Code is highly portable, and hashes are identical across all platforms (little / big endian).
|Branch |Status |
|------------|---------|
|release | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=release)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Arelease+) |
|dev | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=dev)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Adev+) |
Benchmarks
-------------------------
The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
The [open source benchmark program] is compiled with `clang` v10.0 using the `-O3` flag.
| Hash Name | Width | Bandwidth (GB/s) | Small Data Velocity | Quality | Comment |
| --------- | ----- | ---------------- | ----- | --- | --- |
| __XXH3__ (SSE2) | 64 | 31.5 GB/s | 133.1 | 10
| __XXH128__ (SSE2) | 128 | 29.6 GB/s | 118.1 | 10
| _RAM sequential read_ | N/A | 28.0 GB/s | N/A | N/A | _for reference_
| City64 | 64 | 22.0 GB/s | 76.6 | 10
| T1ha2 | 64 | 22.0 GB/s | 99.0 | 9 | Slightly worse [collisions]
| City128 | 128 | 21.7 GB/s | 57.7 | 10
| __XXH64__ | 64 | 19.4 GB/s | 71.0 | 10
| SpookyHash | 64 | 19.3 GB/s | 53.2 | 10
| Mum | 64 | 18.0 GB/s | 67.0 | 9 | Slightly worse [collisions]
| __XXH32__ | 32 | 9.7 GB/s | 71.9 | 10
| City32 | 32 | 9.1 GB/s | 66.0 | 10
| Murmur3 | 32 | 3.9 GB/s | 56.1 | 10
| SipHash | 64 | 3.0 GB/s | 43.2 | 10
| FNV64 | 64 | 1.2 GB/s | 62.7 | 5 | Poor avalanche properties
| Blake2 | 256 | 1.1 GB/s | 5.1 | 10 | Cryptographic
| SHA1 | 160 | 0.8 GB/s | 5.6 | 10 | Cryptographic but broken
| MD5 | 128 | 0.6 GB/s | 7.8 | 10 | Cryptographic but broken
[open source benchmark program]: https://github.com/Cyan4973/xxHash/tree/release/tests/bench
[collisions]: https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison#collision-study
note 1: Small data velocity is a _rough_ evaluation of an algorithm's efficiency on small data. For a more detailed analysis, please refer to the next paragraph.
note 2: some algorithms feature _faster than RAM_ speed, in which case they can only reach their full speed when the input data is already in CPU cache (L3 or better). Otherwise, they max out at the RAM speed limit.
### Small data
Performance on large data is only one part of the picture.
Hashing is also very useful in constructions like hash tables and bloom filters.
In these use cases, it's frequent to hash a lot of small data (starting at a few bytes).
An algorithm's performance can be very different in such scenarios, since parts of the algorithm,
such as initialization or finalization, become a fixed cost.
The impact of branch mis-prediction also becomes much more significant.
XXH3 has been designed for excellent performance on both long and small inputs,
which can be observed in the following graph:
![XXH3, latency, random size](https://user-images.githubusercontent.com/750081/61976089-aedeab00-af9f-11e9-9239-e5375d6c080f.png)
For a more detailed analysis, visit the wiki:
https://github.com/Cyan4973/xxHash/wiki/Performance-comparison#benchmarks-concentrating-on-small-data-
Quality
-------------------------
Speed is not the only property that matters.
Produced hash values must exhibit excellent dispersion and randomness properties,
so that any sub-section of the hash can be used to maximally spread out a table or index,
and the number of collisions stays at the minimal theoretical level, following the [birthday paradox].
`xxHash` has been tested with Austin Appleby's excellent SMHasher test suite,
and passes all tests, ensuring reasonable quality levels.
It also passes extended tests from [newer forks of SMHasher], featuring additional scenarios and conditions.
Finally, xxHash provides its own [massive collision tester](https://github.com/Cyan4973/xxHash/tree/dev/tests/collisions),
able to generate and compare billions of hashes to test the limits of 64-bit hash algorithms.
On this front too, xxHash features good results, in line with the [birthday paradox].
A more detailed analysis is documented [in the wiki](https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison).
[birthday paradox]: https://en.wikipedia.org/wiki/Birthday_problem
[newer forks of SMHasher]: https://github.com/rurban/smhasher
### Build modifiers
The following macros can be set at compilation time to modify libxxhash's behavior. They are generally disabled by default.
- `XXH_INLINE_ALL`: Make all functions `inline`, with implementations being directly included within `xxhash.h`.
Inlining functions is beneficial for speed on small keys.
It's _extremely effective_ when key length is expressed as _a compile time constant_,
with performance improvements observed in the +200% range.
See [this article](https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html) for details.
- `XXH_PRIVATE_API`: same outcome as `XXH_INLINE_ALL`. Still available for legacy support.
The name underlines that `XXH_*` symbols will not be exported.
- `XXH_NAMESPACE`: Prefixes all symbols with the value of `XXH_NAMESPACE`.
The macro value can only contain characters that are valid in C symbol names.
Useful to evade symbol naming collisions,
in case of multiple inclusions of xxHash's source code.
Client applications still use the regular function names,
as symbols are automatically translated through `xxhash.h`.
- `XXH_FORCE_MEMORY_ACCESS`: The default method `0` uses a portable `memcpy()` notation.
Method `1` uses a gcc-specific `packed` attribute, which can provide better performance for some targets.
Method `2` forces unaligned reads, which is not standards compliant, but can sometimes be the only way to extract better read performance.
Method `3` uses a byteshift operation, which is best for old compilers that don't inline `memcpy()`, or for big-endian systems without a byteswap instruction.
- `XXH_FORCE_ALIGN_CHECK`: Use a faster direct read path when input is aligned.
This option can result in dramatic performance improvement when input to hash is aligned on 32 or 64-bit boundaries,
when running on architectures unable to load memory from unaligned addresses, or suffering a performance penalty from it.
It is (slightly) detrimental on platforms with good unaligned memory access performance (the same instruction serves both aligned and unaligned accesses).
This option is automatically disabled on `x86`, `x64` and `aarch64`, and enabled on all other platforms.
- `XXH_VECTOR` : manually select a vector instruction set (default: auto-selected at compilation time). Available instruction sets are `XXH_SCALAR`, `XXH_SSE2`, `XXH_AVX2`, `XXH_AVX512`, `XXH_NEON` and `XXH_VSX`. Compiler may require additional flags to ensure proper support (for example, `gcc` on linux will require `-mavx2` for AVX2, and `-mavx512f` for AVX512).
- `XXH_NO_PREFETCH` : disable prefetching. Some platforms or situations may perform better without prefetching. XXH3 only.
- `XXH_PREFETCH_DIST` : select prefetching distance. For close-to-metal adaptation to specific hardware platforms. XXH3 only.
- `XXH_NO_STREAM`: Disables the streaming API, limiting it to single shot variants only.
- `XXH_SIZE_OPT`: `0`: default, optimize for speed
`1`: default for `-Os` and `-Oz`: disables some speed hacks for size optimization
`2`: makes code as small as possible, performance may cry
- `XXH_NO_INLINE_HINTS`: By default, xxHash uses `__attribute__((always_inline))` and `__forceinline` to improve performance at the cost of code size.
Defining this macro to 1 will mark all internal functions as `static`, allowing the compiler to decide whether to inline a function or not.
This is very useful when optimizing for smallest binary size,
and is automatically defined when compiling with `-O0`, `-Os`, `-Oz`, or `-fno-inline` on GCC and Clang.
This may also increase performance depending on compiler and architecture.
- `XXH32_ENDJMP`: Replace the multi-branch finalization stage of XXH32 with a single jump.
This is generally undesirable for performance, especially when hashing inputs of random sizes.
But depending on exact architecture and compiler, a jump might provide slightly better performance on small inputs. Disabled by default.
- `XXH_NO_STDLIB`: Disable invocation of `<stdlib.h>` functions, notably `malloc()` and `free()`.
`libxxhash`'s `XXH*_createState()` will always fail and return `NULL`.
But one-shot hashing (like `XXH32()`) or streaming using statically allocated states
still work as expected.
This build flag is useful for embedded environments without dynamic allocation.
- `XXH_STATIC_LINKING_ONLY`: gives access to internal state declaration, required for static allocation.
Incompatible with dynamic linking, due to risks of ABI changes.
- `XXH_NO_XXH3` : removes symbols related to `XXH3` (both 64 & 128 bits) from generated binary.
Useful to reduce binary size, notably for applications which do not use `XXH3`.
- `XXH_NO_LONG_LONG`: removes compilation of algorithms relying on 64-bit types (XXH3 and XXH64). Only XXH32 will be compiled.
Useful for targets (architectures and compilers) without 64-bit support.
- `XXH_IMPORT`: MSVC specific: should only be defined for dynamic linking, as it prevents linkage errors.
- `XXH_CPU_LITTLE_ENDIAN`: By default, endianness is determined by a runtime test resolved at compile time.
If, for some reason, the compiler cannot simplify the runtime test, it can cost performance.
It's possible to skip auto-detection and simply state that the architecture is little-endian by setting this macro to 1.
Setting it to 0 states big-endian.
- `XXH_DEBUGLEVEL`: When set to any value >= 1, enables `assert()` statements.
This (slightly) slows down execution, but may help find bugs during debugging sessions.
When compiling the Command Line Interface `xxhsum` with `make`, the following environment variables can also be set:
- `DISPATCH=1` : use `xxh_x86dispatch.c`, to automatically select between `scalar`, `sse2`, `avx2` or `avx512` instruction set at runtime, depending on local host. This option is only valid for `x86`/`x64` systems.
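
To make the `XXH_INLINE_ALL` modifier above concrete, here is a minimal sketch of the intended pattern (define the macro once, before the single include); this is also the pattern the QCryptographicHash port below follows:

```C
/* XXH_INLINE_ALL must be defined before including xxhash.h; all xxHash
 * functions then become static inline within this translation unit. */
#define XXH_INLINE_ALL
#include "xxhash.h"

#include <stdio.h>

int main(void) {
    const char data[] = "Hello, xxHash";
    /* One-shot 64-bit hash with seed 0. */
    const XXH64_hash_t h = XXH64(data, sizeof(data) - 1, 0);
    printf("XXH64: %016llx\n", (unsigned long long)h);
    return 0;
}
```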
### Building xxHash - Using vcpkg
You can download and install xxHash using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
```bash
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh
./vcpkg integrate install
./vcpkg install xxhash
```
The xxHash port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
### Building and Using xxHash - tipi.build
You can work on xxHash and depend on it in your [tipi.build](https://tipi.build) projects by adding the following entry to your `.tipi/deps`:
```json
{
"Cyan4973/xxHash": { "@": "v0.8.1" }
}
```
An example of such usage can be found in the `/cli` folder of this project, which, if built as the root project, will depend on release `v0.8.1` of xxHash.
To contribute to xxHash itself use tipi.build on this repository (change the target name appropriately to `linux` or `macos` or `windows`):
```bash
tipi . -t <target> --test all
```
### Example
The simplest example calls the xxhash 64-bit variant as a one-shot function,
generating a hash value from a single buffer, and is invoked from a C/C++ program:
```C
#include "xxhash.h"
(...)
XXH64_hash_t hash = XXH64(buffer, size, seed);
```
Streaming variant is more involved, but makes it possible to provide data incrementally:
```C
#include "stdlib.h" /* abort() */
#include "xxhash.h"
XXH64_hash_t calcul_hash_streaming(FileHandler fh)
{
/* create a hash state */
XXH64_state_t* const state = XXH64_createState();
if (state==NULL) abort();
size_t const bufferSize = SOME_SIZE;
void* const buffer = malloc(bufferSize);
if (buffer==NULL) abort();
/* Initialize state with selected seed */
XXH64_hash_t const seed = 0; /* or any other value */
if (XXH64_reset(state, seed) == XXH_ERROR) abort();
/* Feed the state with input data, any size, any number of times */
(...)
while ( /* some data left */ ) {
size_t const length = get_more_data(buffer, bufferSize, fh);
if (XXH64_update(state, buffer, length) == XXH_ERROR) abort();
(...)
}
(...)
/* Produce the final hash value */
XXH64_hash_t const hash = XXH64_digest(state);
/* State could be re-used; but in this example, it is simply freed */
free(buffer);
XXH64_freeState(state);
return hash;
}
```
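
The 128-bit variant follows the same shape. A minimal one-shot sketch, using the canonical (endian-independent) byte representation that the Katie port below stores in its digests:

```C
#include <stdio.h>
#include "xxhash.h"

int main(void) {
    const char msg[] = "abc";
    /* One-shot 128-bit hash from the XXH3 family (no seed). */
    const XXH128_hash_t h = XXH3_128bits(msg, sizeof(msg) - 1);

    /* Canonical form is big-endian bytes, identical across platforms. */
    XXH128_canonical_t canon;
    XXH128_canonicalFromHash(&canon, h);

    for (size_t i = 0; i < sizeof(canon.digest); i++) {
        printf("%02x", canon.digest[i]);
    }
    printf("\n");
    return 0;
}
```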
### License
The library files `xxhash.c` and `xxhash.h` are BSD licensed.
The utility `xxhsum` is GPL licensed.
### Other programming languages
Beyond the C reference version,
xxHash is also available from many different programming languages,
thanks to great contributors.
They are [listed here](http://www.xxhash.com/#other-languages).
### Packaging status
Many distributions bundle a package manager
which allows easy xxhash installation as both a `libxxhash` library
and `xxhsum` command line interface.
[![Packaging status](https://repology.org/badge/vertical-allrepos/xxhash.svg)](https://repology.org/project/xxhash/versions)
### Special Thanks
- Takayuki Matsuoka, aka @t-mat, for creating `xxhsum -c` and great support during early xxh releases
- Mathias Westerdahl, aka @JCash, for introducing the first version of `XXH64`
- Devin Hussey, aka @easyaspi314, for incredible low-level optimizations on `XXH3` and `XXH128`

src/3rdparty/xxHash/xxhash.h vendored Normal file

File diff suppressed because it is too large (6074 lines).

src/network/CMakeLists.txt

@@ -20,7 +20,7 @@ include_directories(
${CMAKE_BINARY_DIR}/include/QtCore
${CMAKE_BINARY_DIR}/include/QtNetwork
${CMAKE_SOURCE_DIR}/src/3rdparty/digest
${CMAKE_SOURCE_DIR}/src/3rdparty/BLAKE3
${CMAKE_SOURCE_DIR}/src/3rdparty/xxHash
)
set(NETWORK_HEADERS
@@ -65,9 +65,6 @@ set(NETWORK_SOURCES
${CMAKE_SOURCE_DIR}/src/3rdparty/digest/md5c.c
${CMAKE_SOURCE_DIR}/src/3rdparty/digest/sha1.c
${CMAKE_SOURCE_DIR}/src/3rdparty/digest/sha2.c
${CMAKE_SOURCE_DIR}/src/3rdparty/BLAKE3/blake3.c
${CMAKE_SOURCE_DIR}/src/3rdparty/BLAKE3/blake3_dispatch.c
${CMAKE_SOURCE_DIR}/src/3rdparty/BLAKE3/blake3_portable.c
)
katie_generate_misc("${NETWORK_HEADERS}" QtNetwork)

qcryptographichash.cpp

@@ -27,7 +27,8 @@
#include "md5.h"
#include "sha1.h"
#include "sha2.h"
#include "blake3.h"
#define XXH_INLINE_ALL
#include "xxhash.h"
QT_BEGIN_NAMESPACE
@@ -35,22 +36,32 @@ class QCryptographicHashPrivate
{
public:
QCryptographicHashPrivate(const QCryptographicHash::Algorithm amethod);
~QCryptographicHashPrivate();
MD5_CTX md5Context;
SHA1_CTX sha1Context;
SHA256_CTX sha256Context;
SHA512_CTX sha512Context;
blake3_hasher blake3Context;
XXH3_state_t* xxh3Context;
XXH3_state_t* xxh3Context2;
bool rehash;
const QCryptographicHash::Algorithm method;
};
QCryptographicHashPrivate::QCryptographicHashPrivate(const QCryptographicHash::Algorithm amethod)
: rehash(false),
: xxh3Context(nullptr),
xxh3Context2(nullptr),
rehash(false),
method(amethod)
{
}
QCryptographicHashPrivate::~QCryptographicHashPrivate()
{
XXH3_freeState(xxh3Context);
XXH3_freeState(xxh3Context2);
}
/*!
\class QCryptographicHash
\inmodule QtCore
@@ -66,7 +77,7 @@ QCryptographicHashPrivate::QCryptographicHashPrivate(const QCryptographicHash::A
QCryptographicHash can be used to generate cryptographic hashes of binary
or text data.
Currently MD5, SHA-1, SHA-256, SHA-512 and BLAKE3 are supported.
Currently MD5, SHA-1, SHA-256, SHA-512 and a custom one are supported.
*/
/*!
@@ -76,7 +87,7 @@ QCryptographicHashPrivate::QCryptographicHashPrivate(const QCryptographicHash::A
\value Sha1 Generate an SHA-1 hash sum
\value Sha256 Generate an SHA-256 hash sum (SHA-2). Introduced in Katie 4.9
\value Sha512 Generate an SHA-512 hash sum (SHA-2). Introduced in Katie 4.9
\value BLAKE3 Generate an BLAKE3 hash sum. Introduced in Katie 4.12
\value KAT Generate a custom 256-bit hash sum. Introduced in Katie 4.12
*/
/*!
@@ -120,8 +131,15 @@ void QCryptographicHash::reset()
SHA512_Init(&d->sha512Context);
break;
}
case QCryptographicHash::BLAKE3: {
blake3_hasher_init(&d->blake3Context);
case QCryptographicHash::KAT: {
if (!d->xxh3Context) {
d->xxh3Context = XXH3_createState();
}
if (!d->xxh3Context2) {
d->xxh3Context2 = XXH3_createState();
}
XXH3_128bits_reset(d->xxh3Context);
XXH3_128bits_reset(d->xxh3Context2);
break;
}
}
@@ -150,8 +168,11 @@ void QCryptographicHash::addData(const char *data, int length)
SHA512_Update(&d->sha512Context, reinterpret_cast<const uchar*>(data), length);
break;
}
case QCryptographicHash::BLAKE3: {
blake3_hasher_update(&d->blake3Context, reinterpret_cast<const uchar*>(data), length);
case QCryptographicHash::KAT: {
XXH3_128bits_update(d->xxh3Context, data, length);
if (Q_LIKELY(length > 1)) {
XXH3_128bits_update(d->xxh3Context2, data, length / 2);
}
break;
}
}
@@ -215,10 +236,14 @@ QByteArray QCryptographicHash::result() const
SHA512_Final(result, &copy);
return QByteArray(reinterpret_cast<char *>(result), SHA512_DIGEST_LENGTH);
}
case QCryptographicHash::BLAKE3: {
QSTACKARRAY(uint8_t, result, BLAKE3_OUT_LEN);
blake3_hasher_finalize(&d->blake3Context, result, BLAKE3_OUT_LEN);
return QByteArray(reinterpret_cast<char *>(result), BLAKE3_OUT_LEN);
case QCryptographicHash::KAT: {
const XXH128_hash_t xxresult = XXH3_128bits_digest(d->xxh3Context);
XXH128_canonical_t xxcanonical;
XXH128_canonicalFromHash(&xxcanonical, xxresult);
const XXH128_hash_t xxresult2 = XXH3_128bits_digest(d->xxh3Context2);
XXH128_canonical_t xxcanonical2;
XXH128_canonicalFromHash(&xxcanonical2, xxresult2);
return (QByteArray(reinterpret_cast<char *>(xxcanonical.digest), sizeof(XXH128_hash_t)) + QByteArray(reinterpret_cast<char *>(xxcanonical2.digest), sizeof(XXH128_hash_t)));
}
}
@@ -263,13 +288,19 @@ QByteArray QCryptographicHash::hash(const QByteArray &data, QCryptographicHash::
SHA512_Final(result, &sha512Context);
return QByteArray(reinterpret_cast<char *>(result), SHA512_DIGEST_LENGTH);
}
case QCryptographicHash::BLAKE3: {
QSTACKARRAY(uint8_t, result, BLAKE3_OUT_LEN);
blake3_hasher blake3Context;
blake3_hasher_init(&blake3Context);
blake3_hasher_update(&blake3Context, reinterpret_cast<const uchar*>(data.constData()), data.length());
blake3_hasher_finalize(&blake3Context, result, BLAKE3_OUT_LEN);
return QByteArray(reinterpret_cast<char *>(result), BLAKE3_OUT_LEN);
case QCryptographicHash::KAT: {
const XXH128_hash_t xxresult = XXH3_128bits(data.constData(), data.length());
XXH128_canonical_t xxcanonical;
XXH128_canonicalFromHash(&xxcanonical, xxresult);
XXH128_hash_t xxresult2;
if (Q_LIKELY(data.length() > 1)) {
xxresult2 = XXH3_128bits(data.constData(), data.length() / 2);
} else {
xxresult2 = XXH3_128bits("KATIE", 5);
}
XXH128_canonical_t xxcanonical2;
XXH128_canonicalFromHash(&xxcanonical2, xxresult2);
return (QByteArray(reinterpret_cast<char *>(xxcanonical.digest), sizeof(XXH128_hash_t)) + QByteArray(reinterpret_cast<char *>(xxcanonical2.digest), sizeof(XXH128_hash_t)));
}
}
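
To summarize the new one-shot path above: the 256-bit KAT digest is the canonical XXH3-128 of the whole input concatenated with the canonical XXH3-128 of its first half, so two inputs only produce the same digest if both their full contents and their first halves collide under XXH3-128. A standalone sketch of the same construction (the helper name `kat_hash_256` is illustrative, not part of the patch):

```C
#define XXH_INLINE_ALL
#include "xxhash.h"
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* digest = canonical XXH3-128(input) || canonical XXH3-128(first half) */
static void kat_hash_256(const void *data, size_t len, uint8_t out[32])
{
    XXH128_canonical_t c1;
    XXH128_canonicalFromHash(&c1, XXH3_128bits(data, len));

    /* Inputs of 0 or 1 bytes have no usable half; the patch hashes the
     * fixed string "KATIE" instead. */
    const XXH128_hash_t second = (len > 1)
        ? XXH3_128bits(data, len / 2)
        : XXH3_128bits("KATIE", 5);

    XXH128_canonical_t c2;
    XXH128_canonicalFromHash(&c2, second);

    memcpy(out, c1.digest, sizeof(c1.digest));
    memcpy(out + 16, c2.digest, sizeof(c2.digest));
}

int main(void) {
    uint8_t digest[32];
    kat_hash_256("abc", 3, digest);
    for (size_t i = 0; i < sizeof(digest); i++) {
        printf("%02x", digest[i]);
    }
    printf("\n");
    return 0;
}
```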

qcryptographichash.h

@@ -39,7 +39,7 @@ public:
Sha1,
Sha256,
Sha512,
BLAKE3
KAT
};
explicit QCryptographicHash(Algorithm method);

tst_qcryptographichash.cpp (auto test)

@@ -85,10 +85,10 @@ void tst_QCryptographicHash::intermediary_result_data()
<< QByteArray("abc") << QByteArray("abc")
<< QByteArray::fromHex("DDAF35A193617ABACC417349AE20413112E6FA4E89A97EA20A9EEEE64B55D39A2192992A274FC1A836BA3C23A3FEEBBD454D4423643CE80E2A9AC94FA54CA49F")
<< QByteArray::fromHex("F3C41E7B63EE869596FC28BAD64120612C520F65928AB4D126C72C6998B551B8FF1CEDDFED4373E6717554DC89D1EEE6F0AB22FD3675E561ABA9AE26A3EEC53B");
QTest::newRow("BLAKE3") << int(QCryptographicHash::BLAKE3)
QTest::newRow("KAT") << int(QCryptographicHash::KAT)
<< QByteArray("abc") << QByteArray("abc")
<< QByteArray::fromHex("6437b3ac38465133ffb63b75273a8db548c558465d79db03fd359c6cd5bd9d85")
<< QByteArray::fromHex("8a120b9472cc2c9873ec4283ff85376799f0864119c167440aeb7ec7a631fbca");
<< QByteArray::fromHex("06b05ab6733a618578af5f94892f3950a96faf705af16834e6c632b61e964e1f")
<< QByteArray::fromHex("3b3faa148c5f05f1b441a07dd428557bb9fe94d346d39b20369242a646a19333");
}
void tst_QCryptographicHash::intermediary_result()

tst_qcryptographichash.cpp (benchmark)

@@ -49,25 +49,25 @@ void tst_qcryptographichash::append_data()
QTest::newRow("10 (Sha1)") << int(10) << QCryptographicHash::Sha1;
QTest::newRow("10 (Sha256)") << int(10) << QCryptographicHash::Sha256;
QTest::newRow("10 (Sha512)") << int(10) << QCryptographicHash::Sha512;
QTest::newRow("10 (BLAKE3)") << int(10) << QCryptographicHash::BLAKE3;
QTest::newRow("10 (KAT)") << int(10) << QCryptographicHash::KAT;
QTest::newRow("100 (Md5)") << int(100) << QCryptographicHash::Md5;
QTest::newRow("100 (Sha1)") << int(100) << QCryptographicHash::Sha1;
QTest::newRow("100 (Sha256)") << int(100) << QCryptographicHash::Sha256;
QTest::newRow("100 (Sha512)") << int(100) << QCryptographicHash::Sha512;
QTest::newRow("100 (BLAKE3)") << int(100) << QCryptographicHash::BLAKE3;
QTest::newRow("100 (KAT)") << int(100) << QCryptographicHash::KAT;
QTest::newRow("250 (Md5)") << int(250) << QCryptographicHash::Md5;
QTest::newRow("250 (Sha1)") << int(250) << QCryptographicHash::Sha1;
QTest::newRow("250 (Sha256)") << int(250) << QCryptographicHash::Sha256;
QTest::newRow("250 (Sha512)") << int(250) << QCryptographicHash::Sha512;
QTest::newRow("250 (BLAKE3)") << int(250) << QCryptographicHash::BLAKE3;
QTest::newRow("250 (KAT)") << int(250) << QCryptographicHash::KAT;
QTest::newRow("500 (Md5)") << int(500) << QCryptographicHash::Md5;
QTest::newRow("500 (Sha1)") << int(500) << QCryptographicHash::Sha1;
QTest::newRow("500 (Sha256)") << int(500) << QCryptographicHash::Sha256;
QTest::newRow("500 (Sha512)") << int(500) << QCryptographicHash::Sha512;
QTest::newRow("500 (BLAKE3)") << int(500) << QCryptographicHash::BLAKE3;
QTest::newRow("500 (KAT)") << int(500) << QCryptographicHash::KAT;
}
void tst_qcryptographichash::append()
@@ -101,7 +101,7 @@ void tst_qcryptographichash::append_once_data()
QTest::newRow("Sha1") << QCryptographicHash::Sha1;
QTest::newRow("Sha256") << QCryptographicHash::Sha256;
QTest::newRow("Sha512") << QCryptographicHash::Sha512;
QTest::newRow("BLAKE3") << QCryptographicHash::BLAKE3;
QTest::newRow("KAT") << QCryptographicHash::KAT;
}
void tst_qcryptographichash::append_once()

filehash example

@@ -32,7 +32,7 @@ int main(int argc, char *argv[])
const QStringList args = app.arguments().mid(1);
if (args.size() != 2) {
qWarning() << "Usage: filehash <md5|sha1|sha256|sha512|blake3> <filepath>";
qWarning() << "Usage: filehash <md5|sha1|sha256|sha512|kat> <filepath>";
return 1;
}
@@ -45,8 +45,8 @@
algorithm = QCryptographicHash::Sha256;
} else if (args.at(0) == "sha512") {
algorithm = QCryptographicHash::Sha512;
} else if (args.at(0) == "blake3") {
algorithm = QCryptographicHash::BLAKE3;
} else if (args.at(0) == "kat") {
algorithm = QCryptographicHash::KAT;
} else {
qWarning() << "Invalid algorithm" << args.at(0);
return 2;