[llvm] d2ad63a - [Support/BLAKE3] Make g_cpu_features thread safe (#147948)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 12 00:02:59 PDT 2025
Author: Dmitry Vasilyev
Date: 2025-07-12T11:02:56+04:00
New Revision: d2ad63a193216d008c8161879a59c5f42e0125cc
URL: https://github.com/llvm/llvm-project/commit/d2ad63a193216d008c8161879a59c5f42e0125cc
DIFF: https://github.com/llvm/llvm-project/commit/d2ad63a193216d008c8161879a59c5f42e0125cc.diff
LOG: [Support/BLAKE3] Make g_cpu_features thread safe (#147948)
`g_cpu_features` can be updated multiple times by `get_cpu_features()`,
which reports a thread sanitizer error when used with multiple lld
threads.
This PR updates BLAKE3 to v1.8.2.
Added:
Modified:
llvm/include/llvm-c/blake3.h
llvm/lib/Support/BLAKE3/README.md
llvm/lib/Support/BLAKE3/blake3.c
llvm/lib/Support/BLAKE3/blake3_avx2.c
llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
llvm/lib/Support/BLAKE3/blake3_avx512.c
llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
llvm/lib/Support/BLAKE3/blake3_dispatch.c
llvm/lib/Support/BLAKE3/blake3_impl.h
llvm/lib/Support/BLAKE3/blake3_neon.c
llvm/lib/Support/BLAKE3/blake3_sse2.c
llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
llvm/lib/Support/BLAKE3/blake3_sse41.c
llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
Removed:
################################################################################
diff --git a/llvm/include/llvm-c/blake3.h b/llvm/include/llvm-c/blake3.h
index ad3eb08464496..ba5df64bbab8c 100644
--- a/llvm/include/llvm-c/blake3.h
+++ b/llvm/include/llvm-c/blake3.h
@@ -25,7 +25,7 @@
extern "C" {
#endif
-#define LLVM_BLAKE3_VERSION_STRING "1.3.1"
+#define LLVM_BLAKE3_VERSION_STRING "1.8.2"
#define LLVM_BLAKE3_KEY_LEN 32
#define LLVM_BLAKE3_OUT_LEN 32
#define LLVM_BLAKE3_BLOCK_LEN 64
diff --git a/llvm/lib/Support/BLAKE3/README.md b/llvm/lib/Support/BLAKE3/README.md
index 319a7514e8b50..bf17502f53b10 100644
--- a/llvm/lib/Support/BLAKE3/README.md
+++ b/llvm/lib/Support/BLAKE3/README.md
@@ -1,4 +1,4 @@
-Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
+Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.8.2/c
# Example
diff --git a/llvm/lib/Support/BLAKE3/blake3.c b/llvm/lib/Support/BLAKE3/blake3.c
index 23f0252602de2..ec76dd7157ac4 100644
--- a/llvm/lib/Support/BLAKE3/blake3.c
+++ b/llvm/lib/Support/BLAKE3/blake3.c
@@ -95,24 +95,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
+ if (out_len == 0) {
+ return;
+ }
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
- while (out_len > 0) {
- blake3_compress_xof(self->input_cv, self->block, self->block_len,
- output_block_counter, self->flags | ROOT, wide_buf);
- size_t available_bytes = 64 - offset_within_block;
- size_t memcpy_len;
- if (out_len > available_bytes) {
- memcpy_len = available_bytes;
- } else {
- memcpy_len = out_len;
- }
- memcpy(out, wide_buf + offset_within_block, memcpy_len);
- out += memcpy_len;
- out_len -= memcpy_len;
+ if(offset_within_block) {
+ blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
+ const size_t available_bytes = 64 - offset_within_block;
+ const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
+ memcpy(out, wide_buf + offset_within_block, bytes);
+ out += bytes;
+ out_len -= bytes;
output_block_counter += 1;
- offset_within_block = 0;
+ }
+ if(out_len / 64) {
+ blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
+ }
+ output_block_counter += out_len / 64;
+ out += out_len & -64;
+ out_len -= out_len & -64;
+ if(out_len) {
+ blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
+ memcpy(out, wide_buf, out_len);
}
}
@@ -159,10 +165,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
-INLINE size_t left_len(size_t content_len) {
- // Subtract 1 to reserve at least one byte for the right side. content_len
+INLINE size_t left_subtree_len(size_t input_len) {
+ // Subtract 1 to reserve at least one byte for the right side. input_len
// should always be greater than BLAKE3_CHUNK_LEN.
- size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+ size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}
@@ -251,7 +257,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
@@ -259,18 +265,17 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
-static size_t blake3_compress_subtree_wide(const uint8_t *input,
- size_t input_len,
- const uint32_t key[8],
- uint64_t chunk_counter,
- uint8_t flags, uint8_t *out) {
+size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
+ const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags,
+ uint8_t *out, bool use_tbb) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
@@ -284,7 +289,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
- size_t left_input_len = left_len(input_len);
+ size_t left_input_len = left_subtree_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
@@ -304,12 +309,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
- // Recurse! If this implementation adds multi-threading support in the
- // future, this is where it will go.
- size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
- chunk_counter, flags, cv_array);
- size_t right_n = blake3_compress_subtree_wide(
- right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+ // Recurse!
+ size_t left_n = -1;
+ size_t right_n = -1;
+
+#if defined(BLAKE3_USE_TBB)
+ blake3_compress_subtree_wide_join_tbb(
+ key, flags, use_tbb,
+ // left-hand side
+ input, left_input_len, chunk_counter, cv_array, &left_n,
+ // right-hand side
+ right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
+#else
+ left_n = blake3_compress_subtree_wide(
+ input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
+ right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
+ right_chunk_counter, flags, right_cvs,
+ use_tbb);
+#endif // BLAKE3_USE_TBB
// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
@@ -335,32 +352,37 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
-INLINE void compress_subtree_to_parent_node(
- const uint8_t *input, size_t input_len, const uint32_t key[8],
- uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
+INLINE void
+compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
+ const uint32_t key[8], uint64_t chunk_counter,
+ uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
+ bool use_tbb) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
- chunk_counter, flags, cv_array);
+ chunk_counter, flags, cv_array, use_tbb);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
-
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
+ // set then it emits incorrect warnings here. We tried a few different
+ // hacks to silence these, but in the end our hacks just produced different
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
+ // desperation, we ifdef out this entire loop when we know it's not needed.
+#if MAX_SIMD_DEGREE_OR_2 > 2
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
- // The second half of this loop condition is always true, and we just
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
- // this code, test it against that version.
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
+ while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
+#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
@@ -432,7 +454,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
-// similarly going to be the the root of the whole tree. For example, maybe
+// similarly going to be the root of the whole tree. For example, maybe
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
@@ -457,8 +479,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
self->cv_stack_len += 1;
}
-void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
- size_t input_len) {
+INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
+ size_t input_len, bool use_tbb) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
@@ -544,7 +566,7 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
- self->chunk.flags, cv_pair);
+ self->chunk.flags, cv_pair, use_tbb);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
@@ -566,6 +588,20 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
}
}
+void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
+ size_t input_len) {
+ bool use_tbb = false;
+ blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+
+#if defined(BLAKE3_USE_TBB)
+void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
+ size_t input_len) {
+ bool use_tbb = true;
+ blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+#endif // BLAKE3_USE_TBB
+
void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
llvm_blake3_hasher_finalize_seek(self, 0, out, out_len);
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2.c b/llvm/lib/Support/BLAKE3/blake3_avx2.c
index e76aa1a3aeb3d..381e7c422f33c 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx2.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2.c
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
index 5ad1c641a7fc3..84c6c28670908 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
@@ -1786,7 +1786,7 @@ blake3_hash_many_avx2:
vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 4b
-.section .rodata
+.section .rdata
.p2align 6
ADD0:
.long 0, 1, 2, 3, 4, 5, 6, 7
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512.c b/llvm/lib/Support/BLAKE3/blake3_avx512.c
index 9c35b08c439ae..a7a0d00f6fc04 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512.c
@@ -22,10 +22,14 @@ INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
-INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
+INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
_mm256_storeu_si256((__m256i *)dest, src);
}
+INLINE void storeu_512(__m512i src, uint8_t dest[64]) {
+ _mm512_storeu_si512((__m512i *)dest, src);
+}
+
INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
@@ -429,7 +433,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
}
INLINE void transpose_vecs_128(__m128i vecs[4]) {
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -550,6 +554,54 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
+static
+void blake3_xof4_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[4 * 64]) {
+ __m128i h_vecs[8] = {
+ set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
+ set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
+ };
+ uint32_t block_words[16];
+ load_block_words(block, block_words);
+ __m128i msg_vecs[16];
+ for (size_t i = 0; i < 16; i++) {
+ msg_vecs[i] = set1_128(block_words[i]);
+ }
+ __m128i counter_low_vec, counter_high_vec;
+ load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
+ __m128i block_len_vec = set1_128(block_len);
+ __m128i block_flags_vec = set1_128(flags);
+ __m128i v[16] = {
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn4(v, msg_vecs, 0);
+ round_fn4(v, msg_vecs, 1);
+ round_fn4(v, msg_vecs, 2);
+ round_fn4(v, msg_vecs, 3);
+ round_fn4(v, msg_vecs, 4);
+ round_fn4(v, msg_vecs, 5);
+ round_fn4(v, msg_vecs, 6);
+ for (size_t i = 0; i < 8; i++) {
+ v[i] = xor_128(v[i], v[i+8]);
+ v[i+8] = xor_128(v[i+8], h_vecs[i]);
+ }
+ transpose_vecs_128(&v[0]);
+ transpose_vecs_128(&v[4]);
+ transpose_vecs_128(&v[8]);
+ transpose_vecs_128(&v[12]);
+ for (size_t i = 0; i < 4; i++) {
+ storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
+ storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
+ storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
+ storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
+ }
+}
+
/*
* ----------------------------------------------------------------------------
* hash8_avx512
@@ -684,7 +736,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -802,6 +854,50 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
}
+static
+void blake3_xof8_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[8 * 64]) {
+ __m256i h_vecs[8] = {
+ set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
+ set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
+ };
+ uint32_t block_words[16];
+ load_block_words(block, block_words);
+ __m256i msg_vecs[16];
+ for (size_t i = 0; i < 16; i++) {
+ msg_vecs[i] = set1_256(block_words[i]);
+ }
+ __m256i counter_low_vec, counter_high_vec;
+ load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
+ __m256i block_len_vec = set1_256(block_len);
+ __m256i block_flags_vec = set1_256(flags);
+ __m256i v[16] = {
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn8(v, msg_vecs, 0);
+ round_fn8(v, msg_vecs, 1);
+ round_fn8(v, msg_vecs, 2);
+ round_fn8(v, msg_vecs, 3);
+ round_fn8(v, msg_vecs, 4);
+ round_fn8(v, msg_vecs, 5);
+ round_fn8(v, msg_vecs, 6);
+ for (size_t i = 0; i < 8; i++) {
+ v[i] = xor_256(v[i], v[i+8]);
+ v[i+8] = xor_256(v[i+8], h_vecs[i]);
+ }
+ transpose_vecs_256(&v[0]);
+ transpose_vecs_256(&v[8]);
+ for (size_t i = 0; i < 8; i++) {
+ storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
+ storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
+ }
+}
+
/*
* ----------------------------------------------------------------------------
* hash16_avx512
@@ -959,7 +1055,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
__m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
__m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
- // Interleave 64-bit lates. The _0 unpack is lanes
+ // Interleave 64-bit lanes. The _0 unpack is lanes
// 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
// 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
// 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -1047,13 +1143,26 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
INLINE void load_counters16(uint64_t counter, bool increment_counter,
__m512i *out_lo, __m512i *out_hi) {
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
- const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
- const __m512i add1 = _mm512_and_si512(mask, add0);
- __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
- __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
- __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1));
- *out_lo = l;
- *out_hi = h;
+ const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+ const __m512i low_words = _mm512_add_epi32(
+ _mm512_set1_epi32((int32_t)counter),
+ masked_deltas);
+ // The carry bit is 1 if the high bit of the word was 1 before addition and is
+ // 0 after.
+ // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+ // compute the carry bits here, and originally we did, but that intrinsic is
+ // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+ const __m512i carries = _mm512_srli_epi32(
+ _mm512_andnot_si512(
+ low_words, // 0 after (gets inverted by andnot)
+ _mm512_set1_epi32((int32_t)counter)), // and 1 before
+ 31);
+ const __m512i high_words = _mm512_add_epi32(
+ _mm512_set1_epi32((int32_t)(counter >> 32)),
+ carries);
+ *out_lo = low_words;
+ *out_hi = high_words;
}
static
@@ -1133,6 +1242,48 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
_mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
}
+static
+void blake3_xof16_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[16 * 64]) {
+ __m512i h_vecs[8] = {
+ set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
+ set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
+ };
+ uint32_t block_words[16];
+ load_block_words(block, block_words);
+ __m512i msg_vecs[16];
+ for (size_t i = 0; i < 16; i++) {
+ msg_vecs[i] = set1_512(block_words[i]);
+ }
+ __m512i counter_low_vec, counter_high_vec;
+ load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
+ __m512i block_len_vec = set1_512(block_len);
+ __m512i block_flags_vec = set1_512(flags);
+ __m512i v[16] = {
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn16(v, msg_vecs, 0);
+ round_fn16(v, msg_vecs, 1);
+ round_fn16(v, msg_vecs, 2);
+ round_fn16(v, msg_vecs, 3);
+ round_fn16(v, msg_vecs, 4);
+ round_fn16(v, msg_vecs, 5);
+ round_fn16(v, msg_vecs, 6);
+ for (size_t i = 0; i < 8; i++) {
+ v[i] = xor_512(v[i], v[i+8]);
+ v[i+8] = xor_512(v[i+8], h_vecs[i]);
+ }
+ transpose_vecs_512(&v[0]);
+ for (size_t i = 0; i < 16; i++) {
+ storeu_512(v[i], &out[i * sizeof(__m512i)]);
+ }
+}
+
/*
* ----------------------------------------------------------------------------
* hash_many_avx512
@@ -1205,3 +1356,33 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
out = &out[BLAKE3_OUT_LEN];
}
}
+
+void blake3_xof_many_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t* out, size_t outblocks) {
+ while (outblocks >= 16) {
+ blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
+ counter += 16;
+ outblocks -= 16;
+ out += 16 * BLAKE3_BLOCK_LEN;
+ }
+ while (outblocks >= 8) {
+ blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
+ counter += 8;
+ outblocks -= 8;
+ out += 8 * BLAKE3_BLOCK_LEN;
+ }
+ while (outblocks >= 4) {
+ blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
+ counter += 4;
+ outblocks -= 4;
+ out += 4 * BLAKE3_BLOCK_LEN;
+ }
+ while (outblocks > 0) {
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+ counter += 1;
+ outblocks -= 1;
+ out += BLAKE3_BLOCK_LEN;
+ }
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
index 224605147c3d7..b4b14946de10e 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
@@ -29,12 +29,16 @@ HIDDEN blake3_compress_in_place_avx512
HIDDEN _blake3_compress_in_place_avx512
HIDDEN blake3_compress_xof_avx512
HIDDEN _blake3_compress_xof_avx512
+HIDDEN blake3_xof_many_avx512
+HIDDEN _blake3_xof_many_avx512
.global _blake3_hash_many_avx512
.global blake3_hash_many_avx512
.global blake3_compress_in_place_avx512
.global _blake3_compress_in_place_avx512
.global blake3_compress_xof_avx512
.global _blake3_compress_xof_avx512
+.global blake3_xof_many_avx512
+.global _blake3_xof_many_avx512
#ifdef __APPLE__
.text
@@ -2569,6 +2573,2243 @@ blake3_compress_xof_avx512:
vmovdqu xmmword ptr [r9+0x30], xmm3
ret
+.p2align 6
+blake3_xof_many_avx512:
+_blake3_xof_many_avx512:
+ _CET_ENDBR
+ mov r10,QWORD PTR [rsp+0x8]
+ cmp r10,0x1
+ ja 2f
+ vmovdqu xmm0,XMMWORD PTR [rdi]
+ vmovdqu xmm1,XMMWORD PTR [rdi+0x10]
+ movzx eax,r8b
+ movzx edx,dl
+ shl rax,0x20
+ add rdx,rax
+ vmovq xmm3,rcx
+ vmovq xmm4,rdx
+ vpunpcklqdq xmm3,xmm3,xmm4
+ vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip]
+ vmovups xmm8,XMMWORD PTR [rsi]
+ vmovups xmm9,XMMWORD PTR [rsi+0x10]
+ vshufps xmm4,xmm8,xmm9,0x88
+ vshufps xmm5,xmm8,xmm9,0xdd
+ vmovups xmm8,XMMWORD PTR [rsi+0x20]
+ vmovups xmm9,XMMWORD PTR [rsi+0x30]
+ vshufps xmm6,xmm8,xmm9,0x88
+ vshufps xmm7,xmm8,xmm9,0xdd
+ vpshufd xmm6,xmm6,0x93
+ vpshufd xmm7,xmm7,0x93
+ mov al,0x7
+3:
+ vpaddd xmm0,xmm0,xmm4
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x10
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0xc
+ vpaddd xmm0,xmm0,xmm5
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x8
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0x7
+ vpshufd xmm0,xmm0,0x93
+ vpshufd xmm3,xmm3,0x4e
+ vpshufd xmm2,xmm2,0x39
+ vpaddd xmm0,xmm0,xmm6
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x10
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0xc
+ vpaddd xmm0,xmm0,xmm7
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x8
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0x7
+ vpshufd xmm0,xmm0,0x39
+ vpshufd xmm3,xmm3,0x4e
+ vpshufd xmm2,xmm2,0x93
+ dec al
+ je 3f
+ vshufps xmm8,xmm4,xmm5,0xd6
+ vpshufd xmm9,xmm4,0xf
+ vpshufd xmm4,xmm8,0x39
+ vshufps xmm8,xmm6,xmm7,0xfa
+ vpblendd xmm9,xmm9,xmm8,0xaa
+ vpunpcklqdq xmm8,xmm7,xmm5
+ vpblendd xmm8,xmm8,xmm6,0x88
+ vpshufd xmm8,xmm8,0x78
+ vpunpckhdq xmm5,xmm5,xmm7
+ vpunpckldq xmm6,xmm6,xmm5
+ vpshufd xmm7,xmm6,0x1e
+ vmovdqa xmm5,xmm9
+ vmovdqa xmm6,xmm8
+ jmp 3b
+3:
+ vpxor xmm0,xmm0,xmm2
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm2,xmm2,XMMWORD PTR [rdi]
+ vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10]
+ vmovdqu XMMWORD PTR [r9],xmm0
+ vmovdqu XMMWORD PTR [r9+0x10],xmm1
+ vmovdqu XMMWORD PTR [r9+0x20],xmm2
+ vmovdqu XMMWORD PTR [r9+0x30],xmm3
+ ret
+.p2align 6
+2:
+ push rbp
+ mov rbp,rsp
+ sub rsp,0x90
+ and rsp,0xffffffffffffffc0
+ vpbroadcastd zmm0,ecx
+ shr rcx,0x20
+ vpbroadcastd zmm1,ecx
+ vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip]
+ vpcmpltud k1,zmm2,zmm0
+ vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16}
+ vmovdqa32 ZMMWORD PTR [rsp],zmm2
+ vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1
+ cmp r10,0x10
+ jb 2f
+3:
+ vpbroadcastd zmm16,DWORD PTR [rsi]
+ vpbroadcastd zmm17,DWORD PTR [rsi+0x4]
+ vpbroadcastd zmm18,DWORD PTR [rsi+0x8]
+ vpbroadcastd zmm19,DWORD PTR [rsi+0xc]
+ vpbroadcastd zmm20,DWORD PTR [rsi+0x10]
+ vpbroadcastd zmm21,DWORD PTR [rsi+0x14]
+ vpbroadcastd zmm22,DWORD PTR [rsi+0x18]
+ vpbroadcastd zmm23,DWORD PTR [rsi+0x1c]
+ vpbroadcastd zmm24,DWORD PTR [rsi+0x20]
+ vpbroadcastd zmm25,DWORD PTR [rsi+0x24]
+ vpbroadcastd zmm26,DWORD PTR [rsi+0x28]
+ vpbroadcastd zmm27,DWORD PTR [rsi+0x2c]
+ vpbroadcastd zmm28,DWORD PTR [rsi+0x30]
+ vpbroadcastd zmm29,DWORD PTR [rsi+0x34]
+ vpbroadcastd zmm30,DWORD PTR [rsi+0x38]
+ vpbroadcastd zmm31,DWORD PTR [rsi+0x3c]
+ vpbroadcastd zmm0,DWORD PTR [rdi]
+ vpbroadcastd zmm1,DWORD PTR [rdi+0x4]
+ vpbroadcastd zmm2,DWORD PTR [rdi+0x8]
+ vpbroadcastd zmm3,DWORD PTR [rdi+0xc]
+ vpbroadcastd zmm4,DWORD PTR [rdi+0x10]
+ vpbroadcastd zmm5,DWORD PTR [rdi+0x14]
+ vpbroadcastd zmm6,DWORD PTR [rdi+0x18]
+ vpbroadcastd zmm7,DWORD PTR [rdi+0x1c]
+ vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12,ZMMWORD PTR [rsp]
+ vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40]
+ vpbroadcastd zmm14,edx
+ vpbroadcastd zmm15,r8d
+ vpaddd zmm0,zmm0,zmm16
+ vpaddd zmm1,zmm1,zmm18
+ vpaddd zmm2,zmm2,zmm20
+ vpaddd zmm3,zmm3,zmm22
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm17
+ vpaddd zmm1,zmm1,zmm19
+ vpaddd zmm2,zmm2,zmm21
+ vpaddd zmm3,zmm3,zmm23
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm24
+ vpaddd zmm1,zmm1,zmm26
+ vpaddd zmm2,zmm2,zmm28
+ vpaddd zmm3,zmm3,zmm30
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm25
+ vpaddd zmm1,zmm1,zmm27
+ vpaddd zmm2,zmm2,zmm29
+ vpaddd zmm3,zmm3,zmm31
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm18
+ vpaddd zmm1,zmm1,zmm19
+ vpaddd zmm2,zmm2,zmm23
+ vpaddd zmm3,zmm3,zmm20
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm22
+ vpaddd zmm1,zmm1,zmm26
+ vpaddd zmm2,zmm2,zmm16
+ vpaddd zmm3,zmm3,zmm29
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm17
+ vpaddd zmm1,zmm1,zmm28
+ vpaddd zmm2,zmm2,zmm25
+ vpaddd zmm3,zmm3,zmm31
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm27
+ vpaddd zmm1,zmm1,zmm21
+ vpaddd zmm2,zmm2,zmm30
+ vpaddd zmm3,zmm3,zmm24
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm19
+ vpaddd zmm1,zmm1,zmm26
+ vpaddd zmm2,zmm2,zmm29
+ vpaddd zmm3,zmm3,zmm23
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm20
+ vpaddd zmm1,zmm1,zmm28
+ vpaddd zmm2,zmm2,zmm18
+ vpaddd zmm3,zmm3,zmm30
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm22
+ vpaddd zmm1,zmm1,zmm25
+ vpaddd zmm2,zmm2,zmm27
+ vpaddd zmm3,zmm3,zmm24
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm21
+ vpaddd zmm1,zmm1,zmm16
+ vpaddd zmm2,zmm2,zmm31
+ vpaddd zmm3,zmm3,zmm17
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm26
+ vpaddd zmm1,zmm1,zmm28
+ vpaddd zmm2,zmm2,zmm30
+ vpaddd zmm3,zmm3,zmm29
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm23
+ vpaddd zmm1,zmm1,zmm25
+ vpaddd zmm2,zmm2,zmm19
+ vpaddd zmm3,zmm3,zmm31
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm20
+ vpaddd zmm1,zmm1,zmm27
+ vpaddd zmm2,zmm2,zmm21
+ vpaddd zmm3,zmm3,zmm17
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm16
+ vpaddd zmm1,zmm1,zmm18
+ vpaddd zmm2,zmm2,zmm24
+ vpaddd zmm3,zmm3,zmm22
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm28
+ vpaddd zmm1,zmm1,zmm25
+ vpaddd zmm2,zmm2,zmm31
+ vpaddd zmm3,zmm3,zmm30
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm29
+ vpaddd zmm1,zmm1,zmm27
+ vpaddd zmm2,zmm2,zmm26
+ vpaddd zmm3,zmm3,zmm24
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm23
+ vpaddd zmm1,zmm1,zmm21
+ vpaddd zmm2,zmm2,zmm16
+ vpaddd zmm3,zmm3,zmm22
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm18
+ vpaddd zmm1,zmm1,zmm19
+ vpaddd zmm2,zmm2,zmm17
+ vpaddd zmm3,zmm3,zmm20
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm25
+ vpaddd zmm1,zmm1,zmm27
+ vpaddd zmm2,zmm2,zmm24
+ vpaddd zmm3,zmm3,zmm31
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm30
+ vpaddd zmm1,zmm1,zmm21
+ vpaddd zmm2,zmm2,zmm28
+ vpaddd zmm3,zmm3,zmm17
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm29
+ vpaddd zmm1,zmm1,zmm16
+ vpaddd zmm2,zmm2,zmm18
+ vpaddd zmm3,zmm3,zmm20
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm19
+ vpaddd zmm1,zmm1,zmm26
+ vpaddd zmm2,zmm2,zmm22
+ vpaddd zmm3,zmm3,zmm23
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpaddd zmm0,zmm0,zmm27
+ vpaddd zmm1,zmm1,zmm21
+ vpaddd zmm2,zmm2,zmm17
+ vpaddd zmm3,zmm3,zmm24
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vprord zmm15,zmm15,0x10
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0xc
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vpaddd zmm0,zmm0,zmm31
+ vpaddd zmm1,zmm1,zmm16
+ vpaddd zmm2,zmm2,zmm25
+ vpaddd zmm3,zmm3,zmm22
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm1,zmm1,zmm5
+ vpaddd zmm2,zmm2,zmm6
+ vpaddd zmm3,zmm3,zmm7
+ vpxord zmm12,zmm12,zmm0
+ vpxord zmm13,zmm13,zmm1
+ vpxord zmm14,zmm14,zmm2
+ vpxord zmm15,zmm15,zmm3
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vprord zmm15,zmm15,0x8
+ vpaddd zmm8,zmm8,zmm12
+ vpaddd zmm9,zmm9,zmm13
+ vpaddd zmm10,zmm10,zmm14
+ vpaddd zmm11,zmm11,zmm15
+ vpxord zmm4,zmm4,zmm8
+ vpxord zmm5,zmm5,zmm9
+ vpxord zmm6,zmm6,zmm10
+ vpxord zmm7,zmm7,zmm11
+ vprord zmm4,zmm4,0x7
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vpaddd zmm0,zmm0,zmm30
+ vpaddd zmm1,zmm1,zmm18
+ vpaddd zmm2,zmm2,zmm19
+ vpaddd zmm3,zmm3,zmm23
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x10
+ vprord zmm12,zmm12,0x10
+ vprord zmm13,zmm13,0x10
+ vprord zmm14,zmm14,0x10
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0xc
+ vprord zmm6,zmm6,0xc
+ vprord zmm7,zmm7,0xc
+ vprord zmm4,zmm4,0xc
+ vpaddd zmm0,zmm0,zmm26
+ vpaddd zmm1,zmm1,zmm28
+ vpaddd zmm2,zmm2,zmm20
+ vpaddd zmm3,zmm3,zmm29
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm1,zmm1,zmm6
+ vpaddd zmm2,zmm2,zmm7
+ vpaddd zmm3,zmm3,zmm4
+ vpxord zmm15,zmm15,zmm0
+ vpxord zmm12,zmm12,zmm1
+ vpxord zmm13,zmm13,zmm2
+ vpxord zmm14,zmm14,zmm3
+ vprord zmm15,zmm15,0x8
+ vprord zmm12,zmm12,0x8
+ vprord zmm13,zmm13,0x8
+ vprord zmm14,zmm14,0x8
+ vpaddd zmm10,zmm10,zmm15
+ vpaddd zmm11,zmm11,zmm12
+ vpaddd zmm8,zmm8,zmm13
+ vpaddd zmm9,zmm9,zmm14
+ vpxord zmm5,zmm5,zmm10
+ vpxord zmm6,zmm6,zmm11
+ vpxord zmm7,zmm7,zmm8
+ vpxord zmm4,zmm4,zmm9
+ vprord zmm5,zmm5,0x7
+ vprord zmm6,zmm6,0x7
+ vprord zmm7,zmm7,0x7
+ vprord zmm4,zmm4,0x7
+ vpxord zmm0,zmm0,zmm8
+ vpxord zmm1,zmm1,zmm9
+ vpxord zmm2,zmm2,zmm10
+ vpxord zmm3,zmm3,zmm11
+ vpxord zmm4,zmm4,zmm12
+ vpxord zmm5,zmm5,zmm13
+ vpxord zmm6,zmm6,zmm14
+ vpxord zmm7,zmm7,zmm15
+ vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16}
+ vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16}
+ vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16}
+ vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16}
+ vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16}
+ vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16}
+ vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16}
+ vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16}
+ vpunpckldq zmm16,zmm0,zmm1
+ vpunpckhdq zmm17,zmm0,zmm1
+ vpunpckldq zmm18,zmm2,zmm3
+ vpunpckhdq zmm19,zmm2,zmm3
+ vpunpckldq zmm20,zmm4,zmm5
+ vpunpckhdq zmm21,zmm4,zmm5
+ vpunpckldq zmm22,zmm6,zmm7
+ vpunpckhdq zmm23,zmm6,zmm7
+ vpunpckldq zmm24,zmm8,zmm9
+ vpunpckhdq zmm25,zmm8,zmm9
+ vpunpckldq zmm26,zmm10,zmm11
+ vpunpckhdq zmm27,zmm10,zmm11
+ vpunpckldq zmm28,zmm12,zmm13
+ vpunpckhdq zmm29,zmm12,zmm13
+ vpunpckldq zmm30,zmm14,zmm15
+ vpunpckhdq zmm31,zmm14,zmm15
+ vpunpcklqdq zmm0,zmm16,zmm18
+ vpunpckhqdq zmm1,zmm16,zmm18
+ vpunpcklqdq zmm2,zmm17,zmm19
+ vpunpckhqdq zmm3,zmm17,zmm19
+ vpunpcklqdq zmm4,zmm20,zmm22
+ vpunpckhqdq zmm5,zmm20,zmm22
+ vpunpcklqdq zmm6,zmm21,zmm23
+ vpunpckhqdq zmm7,zmm21,zmm23
+ vpunpcklqdq zmm8,zmm24,zmm26
+ vpunpckhqdq zmm9,zmm24,zmm26
+ vpunpcklqdq zmm10,zmm25,zmm27
+ vpunpckhqdq zmm11,zmm25,zmm27
+ vpunpcklqdq zmm12,zmm28,zmm30
+ vpunpckhqdq zmm13,zmm28,zmm30
+ vpunpcklqdq zmm14,zmm29,zmm31
+ vpunpckhqdq zmm15,zmm29,zmm31
+ vshufi32x4 zmm16,zmm0,zmm4,0x88
+ vshufi32x4 zmm17,zmm1,zmm5,0x88
+ vshufi32x4 zmm18,zmm2,zmm6,0x88
+ vshufi32x4 zmm19,zmm3,zmm7,0x88
+ vshufi32x4 zmm20,zmm0,zmm4,0xdd
+ vshufi32x4 zmm21,zmm1,zmm5,0xdd
+ vshufi32x4 zmm22,zmm2,zmm6,0xdd
+ vshufi32x4 zmm23,zmm3,zmm7,0xdd
+ vshufi32x4 zmm24,zmm8,zmm12,0x88
+ vshufi32x4 zmm25,zmm9,zmm13,0x88
+ vshufi32x4 zmm26,zmm10,zmm14,0x88
+ vshufi32x4 zmm27,zmm11,zmm15,0x88
+ vshufi32x4 zmm28,zmm8,zmm12,0xdd
+ vshufi32x4 zmm29,zmm9,zmm13,0xdd
+ vshufi32x4 zmm30,zmm10,zmm14,0xdd
+ vshufi32x4 zmm31,zmm11,zmm15,0xdd
+ vshufi32x4 zmm0,zmm16,zmm24,0x88
+ vshufi32x4 zmm1,zmm17,zmm25,0x88
+ vshufi32x4 zmm2,zmm18,zmm26,0x88
+ vshufi32x4 zmm3,zmm19,zmm27,0x88
+ vshufi32x4 zmm4,zmm20,zmm28,0x88
+ vshufi32x4 zmm5,zmm21,zmm29,0x88
+ vshufi32x4 zmm6,zmm22,zmm30,0x88
+ vshufi32x4 zmm7,zmm23,zmm31,0x88
+ vshufi32x4 zmm8,zmm16,zmm24,0xdd
+ vshufi32x4 zmm9,zmm17,zmm25,0xdd
+ vshufi32x4 zmm10,zmm18,zmm26,0xdd
+ vshufi32x4 zmm11,zmm19,zmm27,0xdd
+ vshufi32x4 zmm12,zmm20,zmm28,0xdd
+ vshufi32x4 zmm13,zmm21,zmm29,0xdd
+ vshufi32x4 zmm14,zmm22,zmm30,0xdd
+ vshufi32x4 zmm15,zmm23,zmm31,0xdd
+ vmovdqu32 ZMMWORD PTR [r9],zmm0
+ vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1
+ vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2
+ vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3
+ vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4
+ vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5
+ vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6
+ vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7
+ vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8
+ vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9
+ vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10
+ vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11
+ vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12
+ vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13
+ vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14
+ vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15
+ vmovdqa32 zmm0,ZMMWORD PTR [rsp]
+ vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40]
+ vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16}
+ vpcmpltud k1,zmm2,zmm0
+ vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16}
+ vmovdqa32 ZMMWORD PTR [rsp],zmm2
+ vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1
+ add r9,0x400
+ sub r10,0x10
+ cmp r10,0x10
+ jae 3b
+ test r10,r10
+ jne 2f
+9:
+ vzeroupper
+ mov rsp,rbp
+ pop rbp
+ ret
+2:
+ test r10,0x8
+ je 2f
+ vpbroadcastd ymm16,DWORD PTR [rsi]
+ vpbroadcastd ymm17,DWORD PTR [rsi+0x4]
+ vpbroadcastd ymm18,DWORD PTR [rsi+0x8]
+ vpbroadcastd ymm19,DWORD PTR [rsi+0xc]
+ vpbroadcastd ymm20,DWORD PTR [rsi+0x10]
+ vpbroadcastd ymm21,DWORD PTR [rsi+0x14]
+ vpbroadcastd ymm22,DWORD PTR [rsi+0x18]
+ vpbroadcastd ymm23,DWORD PTR [rsi+0x1c]
+ vpbroadcastd ymm24,DWORD PTR [rsi+0x20]
+ vpbroadcastd ymm25,DWORD PTR [rsi+0x24]
+ vpbroadcastd ymm26,DWORD PTR [rsi+0x28]
+ vpbroadcastd ymm27,DWORD PTR [rsi+0x2c]
+ vpbroadcastd ymm28,DWORD PTR [rsi+0x30]
+ vpbroadcastd ymm29,DWORD PTR [rsi+0x34]
+ vpbroadcastd ymm30,DWORD PTR [rsi+0x38]
+ vpbroadcastd ymm31,DWORD PTR [rsi+0x3c]
+ vpbroadcastd ymm0,DWORD PTR [rdi]
+ vpbroadcastd ymm1,DWORD PTR [rdi+0x4]
+ vpbroadcastd ymm2,DWORD PTR [rdi+0x8]
+ vpbroadcastd ymm3,DWORD PTR [rdi+0xc]
+ vpbroadcastd ymm4,DWORD PTR [rdi+0x10]
+ vpbroadcastd ymm5,DWORD PTR [rdi+0x14]
+ vpbroadcastd ymm6,DWORD PTR [rdi+0x18]
+ vpbroadcastd ymm7,DWORD PTR [rdi+0x1c]
+ vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip]
+ vmovdqa ymm12,YMMWORD PTR [rsp]
+ vmovdqa ymm13,YMMWORD PTR [rsp+0x40]
+ vpbroadcastd ymm14,edx
+ vpbroadcastd ymm15,r8d
+ vpaddd ymm0,ymm0,ymm16
+ vpaddd ymm1,ymm1,ymm18
+ vpaddd ymm2,ymm2,ymm20
+ vpaddd ymm3,ymm3,ymm22
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm17
+ vpaddd ymm1,ymm1,ymm19
+ vpaddd ymm2,ymm2,ymm21
+ vpaddd ymm3,ymm3,ymm23
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm24
+ vpaddd ymm1,ymm1,ymm26
+ vpaddd ymm2,ymm2,ymm28
+ vpaddd ymm3,ymm3,ymm30
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm25
+ vpaddd ymm1,ymm1,ymm27
+ vpaddd ymm2,ymm2,ymm29
+ vpaddd ymm3,ymm3,ymm31
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm18
+ vpaddd ymm1,ymm1,ymm19
+ vpaddd ymm2,ymm2,ymm23
+ vpaddd ymm3,ymm3,ymm20
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm22
+ vpaddd ymm1,ymm1,ymm26
+ vpaddd ymm2,ymm2,ymm16
+ vpaddd ymm3,ymm3,ymm29
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm17
+ vpaddd ymm1,ymm1,ymm28
+ vpaddd ymm2,ymm2,ymm25
+ vpaddd ymm3,ymm3,ymm31
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm27
+ vpaddd ymm1,ymm1,ymm21
+ vpaddd ymm2,ymm2,ymm30
+ vpaddd ymm3,ymm3,ymm24
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm19
+ vpaddd ymm1,ymm1,ymm26
+ vpaddd ymm2,ymm2,ymm29
+ vpaddd ymm3,ymm3,ymm23
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm20
+ vpaddd ymm1,ymm1,ymm28
+ vpaddd ymm2,ymm2,ymm18
+ vpaddd ymm3,ymm3,ymm30
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm22
+ vpaddd ymm1,ymm1,ymm25
+ vpaddd ymm2,ymm2,ymm27
+ vpaddd ymm3,ymm3,ymm24
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm21
+ vpaddd ymm1,ymm1,ymm16
+ vpaddd ymm2,ymm2,ymm31
+ vpaddd ymm3,ymm3,ymm17
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm26
+ vpaddd ymm1,ymm1,ymm28
+ vpaddd ymm2,ymm2,ymm30
+ vpaddd ymm3,ymm3,ymm29
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm23
+ vpaddd ymm1,ymm1,ymm25
+ vpaddd ymm2,ymm2,ymm19
+ vpaddd ymm3,ymm3,ymm31
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm20
+ vpaddd ymm1,ymm1,ymm27
+ vpaddd ymm2,ymm2,ymm21
+ vpaddd ymm3,ymm3,ymm17
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm16
+ vpaddd ymm1,ymm1,ymm18
+ vpaddd ymm2,ymm2,ymm24
+ vpaddd ymm3,ymm3,ymm22
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm28
+ vpaddd ymm1,ymm1,ymm25
+ vpaddd ymm2,ymm2,ymm31
+ vpaddd ymm3,ymm3,ymm30
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm29
+ vpaddd ymm1,ymm1,ymm27
+ vpaddd ymm2,ymm2,ymm26
+ vpaddd ymm3,ymm3,ymm24
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm23
+ vpaddd ymm1,ymm1,ymm21
+ vpaddd ymm2,ymm2,ymm16
+ vpaddd ymm3,ymm3,ymm22
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm18
+ vpaddd ymm1,ymm1,ymm19
+ vpaddd ymm2,ymm2,ymm17
+ vpaddd ymm3,ymm3,ymm20
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm25
+ vpaddd ymm1,ymm1,ymm27
+ vpaddd ymm2,ymm2,ymm24
+ vpaddd ymm3,ymm3,ymm31
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm30
+ vpaddd ymm1,ymm1,ymm21
+ vpaddd ymm2,ymm2,ymm28
+ vpaddd ymm3,ymm3,ymm17
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm29
+ vpaddd ymm1,ymm1,ymm16
+ vpaddd ymm2,ymm2,ymm18
+ vpaddd ymm3,ymm3,ymm20
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm19
+ vpaddd ymm1,ymm1,ymm26
+ vpaddd ymm2,ymm2,ymm22
+ vpaddd ymm3,ymm3,ymm23
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpaddd ymm0,ymm0,ymm27
+ vpaddd ymm1,ymm1,ymm21
+ vpaddd ymm2,ymm2,ymm17
+ vpaddd ymm3,ymm3,ymm24
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vprord ymm15,ymm15,0x10
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0xc
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vpaddd ymm0,ymm0,ymm31
+ vpaddd ymm1,ymm1,ymm16
+ vpaddd ymm2,ymm2,ymm25
+ vpaddd ymm3,ymm3,ymm22
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm3,ymm3,ymm7
+ vpxord ymm12,ymm12,ymm0
+ vpxord ymm13,ymm13,ymm1
+ vpxord ymm14,ymm14,ymm2
+ vpxord ymm15,ymm15,ymm3
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vprord ymm15,ymm15,0x8
+ vpaddd ymm8,ymm8,ymm12
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm11,ymm11,ymm15
+ vpxord ymm4,ymm4,ymm8
+ vpxord ymm5,ymm5,ymm9
+ vpxord ymm6,ymm6,ymm10
+ vpxord ymm7,ymm7,ymm11
+ vprord ymm4,ymm4,0x7
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vpaddd ymm0,ymm0,ymm30
+ vpaddd ymm1,ymm1,ymm18
+ vpaddd ymm2,ymm2,ymm19
+ vpaddd ymm3,ymm3,ymm23
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x10
+ vprord ymm12,ymm12,0x10
+ vprord ymm13,ymm13,0x10
+ vprord ymm14,ymm14,0x10
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0xc
+ vprord ymm6,ymm6,0xc
+ vprord ymm7,ymm7,0xc
+ vprord ymm4,ymm4,0xc
+ vpaddd ymm0,ymm0,ymm26
+ vpaddd ymm1,ymm1,ymm28
+ vpaddd ymm2,ymm2,ymm20
+ vpaddd ymm3,ymm3,ymm29
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm1,ymm1,ymm6
+ vpaddd ymm2,ymm2,ymm7
+ vpaddd ymm3,ymm3,ymm4
+ vpxord ymm15,ymm15,ymm0
+ vpxord ymm12,ymm12,ymm1
+ vpxord ymm13,ymm13,ymm2
+ vpxord ymm14,ymm14,ymm3
+ vprord ymm15,ymm15,0x8
+ vprord ymm12,ymm12,0x8
+ vprord ymm13,ymm13,0x8
+ vprord ymm14,ymm14,0x8
+ vpaddd ymm10,ymm10,ymm15
+ vpaddd ymm11,ymm11,ymm12
+ vpaddd ymm8,ymm8,ymm13
+ vpaddd ymm9,ymm9,ymm14
+ vpxord ymm5,ymm5,ymm10
+ vpxord ymm6,ymm6,ymm11
+ vpxord ymm7,ymm7,ymm8
+ vpxord ymm4,ymm4,ymm9
+ vprord ymm5,ymm5,0x7
+ vprord ymm6,ymm6,0x7
+ vprord ymm7,ymm7,0x7
+ vprord ymm4,ymm4,0x7
+ vpxor ymm0,ymm0,ymm8
+ vpxor ymm1,ymm1,ymm9
+ vpxor ymm2,ymm2,ymm10
+ vpxor ymm3,ymm3,ymm11
+ vpxor ymm4,ymm4,ymm12
+ vpxor ymm5,ymm5,ymm13
+ vpxor ymm6,ymm6,ymm14
+ vpxor ymm7,ymm7,ymm15
+ vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8}
+ vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8}
+ vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8}
+ vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8}
+ vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8}
+ vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8}
+ vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8}
+ vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8}
+ vpunpckldq ymm16,ymm0,ymm1
+ vpunpckhdq ymm17,ymm0,ymm1
+ vpunpckldq ymm18,ymm2,ymm3
+ vpunpckhdq ymm19,ymm2,ymm3
+ vpunpckldq ymm20,ymm4,ymm5
+ vpunpckhdq ymm21,ymm4,ymm5
+ vpunpckldq ymm22,ymm6,ymm7
+ vpunpckhdq ymm23,ymm6,ymm7
+ vpunpckldq ymm24,ymm8,ymm9
+ vpunpckhdq ymm25,ymm8,ymm9
+ vpunpckldq ymm26,ymm10,ymm11
+ vpunpckhdq ymm27,ymm10,ymm11
+ vpunpckldq ymm28,ymm12,ymm13
+ vpunpckhdq ymm29,ymm12,ymm13
+ vpunpckldq ymm30,ymm14,ymm15
+ vpunpckhdq ymm31,ymm14,ymm15
+ vpunpcklqdq ymm0,ymm16,ymm18
+ vpunpckhqdq ymm1,ymm16,ymm18
+ vpunpcklqdq ymm2,ymm17,ymm19
+ vpunpckhqdq ymm3,ymm17,ymm19
+ vpunpcklqdq ymm4,ymm20,ymm22
+ vpunpckhqdq ymm5,ymm20,ymm22
+ vpunpcklqdq ymm6,ymm21,ymm23
+ vpunpckhqdq ymm7,ymm21,ymm23
+ vpunpcklqdq ymm8,ymm24,ymm26
+ vpunpckhqdq ymm9,ymm24,ymm26
+ vpunpcklqdq ymm10,ymm25,ymm27
+ vpunpckhqdq ymm11,ymm25,ymm27
+ vpunpcklqdq ymm12,ymm28,ymm30
+ vpunpckhqdq ymm13,ymm28,ymm30
+ vpunpcklqdq ymm14,ymm29,ymm31
+ vpunpckhqdq ymm15,ymm29,ymm31
+ vshufi32x4 ymm16,ymm0,ymm4,0x0
+ vshufi32x4 ymm17,ymm8,ymm12,0x0
+ vshufi32x4 ymm18,ymm1,ymm5,0x0
+ vshufi32x4 ymm19,ymm9,ymm13,0x0
+ vshufi32x4 ymm20,ymm2,ymm6,0x0
+ vshufi32x4 ymm21,ymm10,ymm14,0x0
+ vshufi32x4 ymm22,ymm3,ymm7,0x0
+ vshufi32x4 ymm23,ymm11,ymm15,0x0
+ vshufi32x4 ymm24,ymm0,ymm4,0x3
+ vshufi32x4 ymm25,ymm8,ymm12,0x3
+ vshufi32x4 ymm26,ymm1,ymm5,0x3
+ vshufi32x4 ymm27,ymm9,ymm13,0x3
+ vshufi32x4 ymm28,ymm2,ymm6,0x3
+ vshufi32x4 ymm29,ymm10,ymm14,0x3
+ vshufi32x4 ymm30,ymm3,ymm7,0x3
+ vshufi32x4 ymm31,ymm11,ymm15,0x3
+ vmovdqu32 YMMWORD PTR [r9],ymm16
+ vmovdqu32 YMMWORD PTR [r9+0x20],ymm17
+ vmovdqu32 YMMWORD PTR [r9+0x40],ymm18
+ vmovdqu32 YMMWORD PTR [r9+0x60],ymm19
+ vmovdqu32 YMMWORD PTR [r9+0x80],ymm20
+ vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21
+ vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22
+ vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23
+ vmovdqu32 YMMWORD PTR [r9+0x100],ymm24
+ vmovdqu32 YMMWORD PTR [r9+0x120],ymm25
+ vmovdqu32 YMMWORD PTR [r9+0x140],ymm26
+ vmovdqu32 YMMWORD PTR [r9+0x160],ymm27
+ vmovdqu32 YMMWORD PTR [r9+0x180],ymm28
+ vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29
+ vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30
+ vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31
+ vmovdqa ymm0,YMMWORD PTR [rsp+0x20]
+ vmovdqa ymm1,YMMWORD PTR [rsp+0x60]
+ vmovdqa YMMWORD PTR [rsp],ymm0
+ vmovdqa YMMWORD PTR [rsp+0x40],ymm1
+ add r9,0x200
+ sub r10,0x8
+2:
+ test r10,0x4
+ je 2f
+ vbroadcasti32x4 zmm0,XMMWORD PTR [rdi]
+ vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10]
+ vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip]
+ vmovdqa xmm12,XMMWORD PTR [rsp]
+ vmovdqa xmm13,XMMWORD PTR [rsp+0x40]
+ vpunpckldq xmm14,xmm12,xmm13
+ vpunpckhdq xmm15,xmm12,xmm13
+ vpermq ymm14,ymm14,0xdc
+ vpermq ymm15,ymm15,0xdc
+ vpbroadcastd zmm12,edx
+ vinserti64x4 zmm13,zmm14,ymm15,0x1
+ mov eax,0x4444
+ kmovw k2,eax
+ vpblendmd zmm13{k2},zmm13,zmm12
+ vpbroadcastd zmm15,r8d
+ mov eax,0x8888
+ kmovw k4,eax
+ vpblendmd zmm3{k4},zmm13,zmm15
+ mov eax,0xaaaa
+ kmovw k3,eax
+ vbroadcasti32x4 zmm8,XMMWORD PTR [rsi]
+ vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10]
+ vshufps zmm4,zmm8,zmm9,0x88
+ vshufps zmm5,zmm8,zmm9,0xdd
+ vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20]
+ vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30]
+ vshufps zmm6,zmm8,zmm9,0x88
+ vshufps zmm7,zmm8,zmm9,0xdd
+ vpshufd zmm6,zmm6,0x93
+ vpshufd zmm7,zmm7,0x93
+ mov al,0x7
+3:
+ vpaddd zmm0,zmm0,zmm4
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprord zmm3,zmm3,0x10
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprord zmm1,zmm1,0xc
+ vpaddd zmm0,zmm0,zmm5
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprord zmm3,zmm3,0x8
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprord zmm1,zmm1,0x7
+ vpshufd zmm0,zmm0,0x93
+ vpshufd zmm3,zmm3,0x4e
+ vpshufd zmm2,zmm2,0x39
+ vpaddd zmm0,zmm0,zmm6
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprord zmm3,zmm3,0x10
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprord zmm1,zmm1,0xc
+ vpaddd zmm0,zmm0,zmm7
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprord zmm3,zmm3,0x8
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprord zmm1,zmm1,0x7
+ vpshufd zmm0,zmm0,0x39
+ vpshufd zmm3,zmm3,0x4e
+ vpshufd zmm2,zmm2,0x93
+ dec al
+ je 3f
+ vshufps zmm8,zmm4,zmm5,0xd6
+ vpshufd zmm9,zmm4,0xf
+ vpshufd zmm4,zmm8,0x39
+ vshufps zmm8,zmm6,zmm7,0xfa
+ vpblendmd zmm9{k3},zmm9,zmm8
+ vpunpcklqdq zmm8,zmm7,zmm5
+ vpblendmd zmm8{k4},zmm8,zmm6
+ vpshufd zmm8,zmm8,0x78
+ vpunpckhdq zmm5,zmm5,zmm7
+ vpunpckldq zmm6,zmm6,zmm5
+ vpshufd zmm7,zmm6,0x1e
+ vmovdqa32 zmm5,zmm9
+ vmovdqa32 zmm6,zmm8
+ jmp 3b
+3:
+ vpxord zmm0,zmm0,zmm2
+ vpxord zmm1,zmm1,zmm3
+ vbroadcasti32x4 zmm8,XMMWORD PTR [rdi]
+ vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10]
+ vpxord zmm2,zmm2,zmm8
+ vpxord zmm3,zmm3,zmm9
+ vmovdqu XMMWORD PTR [r9],xmm0
+ vmovdqu XMMWORD PTR [r9+0x10],xmm1
+ vmovdqu XMMWORD PTR [r9+0x20],xmm2
+ vmovdqu XMMWORD PTR [r9+0x30],xmm3
+ vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1
+ vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1
+ vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1
+ vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1
+ vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2
+ vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2
+ vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2
+ vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2
+ vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3
+ vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3
+ vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3
+ vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3
+ vmovdqa xmm0,XMMWORD PTR [rsp+0x10]
+ vmovdqa xmm1,XMMWORD PTR [rsp+0x50]
+ vmovdqa XMMWORD PTR [rsp],xmm0
+ vmovdqa XMMWORD PTR [rsp+0x40],xmm1
+ add r9,0x100
+ sub r10,0x4
+2:
+ test r10,0x2
+ je 2f
+ vbroadcasti128 ymm0,XMMWORD PTR [rdi]
+ vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10]
+ vmovd xmm13,DWORD PTR [rsp]
+ vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1
+ vpinsrd xmm13,xmm13,edx,0x2
+ vmovd xmm14,DWORD PTR [rsp+0x4]
+ vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1
+ vpinsrd xmm14,xmm14,edx,0x2
+ vinserti128 ymm13,ymm13,xmm14,0x1
+ vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip]
+ vpbroadcastd ymm8,r8d
+ vpblendd ymm3,ymm13,ymm8,0x88
+ vbroadcasti128 ymm8,XMMWORD PTR [rsi]
+ vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10]
+ vshufps ymm4,ymm8,ymm9,0x88
+ vshufps ymm5,ymm8,ymm9,0xdd
+ vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20]
+ vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30]
+ vshufps ymm6,ymm8,ymm9,0x88
+ vshufps ymm7,ymm8,ymm9,0xdd
+ vpshufd ymm6,ymm6,0x93
+ vpshufd ymm7,ymm7,0x93
+ mov al,0x7
+3:
+ vpaddd ymm0,ymm0,ymm4
+ vpaddd ymm0,ymm0,ymm1
+ vpxord ymm3,ymm3,ymm0
+ vprord ymm3,ymm3,0x10
+ vpaddd ymm2,ymm2,ymm3
+ vpxord ymm1,ymm1,ymm2
+ vprord ymm1,ymm1,0xc
+ vpaddd ymm0,ymm0,ymm5
+ vpaddd ymm0,ymm0,ymm1
+ vpxord ymm3,ymm3,ymm0
+ vprord ymm3,ymm3,0x8
+ vpaddd ymm2,ymm2,ymm3
+ vpxord ymm1,ymm1,ymm2
+ vprord ymm1,ymm1,0x7
+ vpshufd ymm0,ymm0,0x93
+ vpshufd ymm3,ymm3,0x4e
+ vpshufd ymm2,ymm2,0x39
+ vpaddd ymm0,ymm0,ymm6
+ vpaddd ymm0,ymm0,ymm1
+ vpxord ymm3,ymm3,ymm0
+ vprord ymm3,ymm3,0x10
+ vpaddd ymm2,ymm2,ymm3
+ vpxord ymm1,ymm1,ymm2
+ vprord ymm1,ymm1,0xc
+ vpaddd ymm0,ymm0,ymm7
+ vpaddd ymm0,ymm0,ymm1
+ vpxord ymm3,ymm3,ymm0
+ vprord ymm3,ymm3,0x8
+ vpaddd ymm2,ymm2,ymm3
+ vpxord ymm1,ymm1,ymm2
+ vprord ymm1,ymm1,0x7
+ vpshufd ymm0,ymm0,0x39
+ vpshufd ymm3,ymm3,0x4e
+ vpshufd ymm2,ymm2,0x93
+ dec al
+ je 3f
+ vshufps ymm8,ymm4,ymm5,0xd6
+ vpshufd ymm9,ymm4,0xf
+ vpshufd ymm4,ymm8,0x39
+ vshufps ymm8,ymm6,ymm7,0xfa
+ vpblendd ymm9,ymm9,ymm8,0xaa
+ vpunpcklqdq ymm8,ymm7,ymm5
+ vpblendd ymm8,ymm8,ymm6,0x88
+ vpshufd ymm8,ymm8,0x78
+ vpunpckhdq ymm5,ymm5,ymm7
+ vpunpckldq ymm6,ymm6,ymm5
+ vpshufd ymm7,ymm6,0x1e
+ vmovdqa ymm5,ymm9
+ vmovdqa ymm6,ymm8
+ jmp 3b
+3:
+ vpxor ymm0,ymm0,ymm2
+ vpxor ymm1,ymm1,ymm3
+ vbroadcasti128 ymm8,XMMWORD PTR [rdi]
+ vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10]
+ vpxor ymm2,ymm2,ymm8
+ vpxor ymm3,ymm3,ymm9
+ vmovdqu XMMWORD PTR [r9],xmm0
+ vmovdqu XMMWORD PTR [r9+0x10],xmm1
+ vmovdqu XMMWORD PTR [r9+0x20],xmm2
+ vmovdqu XMMWORD PTR [r9+0x30],xmm3
+ vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1
+ vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1
+ vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1
+ vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1
+ vmovdqu xmm0,XMMWORD PTR [rsp+0x8]
+ vmovdqu xmm1,XMMWORD PTR [rsp+0x48]
+ vmovdqa XMMWORD PTR [rsp],xmm0
+ vmovdqa XMMWORD PTR [rsp+0x40],xmm1
+ add r9,0x80
+ sub r10,0x2
+2:
+ test r10,0x1
+ je 9b
+ vmovdqu xmm0,XMMWORD PTR [rdi]
+ vmovdqu xmm1,XMMWORD PTR [rdi+0x10]
+ vmovd xmm14,DWORD PTR [rsp]
+ vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1
+ vpinsrd xmm14,xmm14,edx,0x2
+ vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip]
+ vpinsrd xmm3,xmm14,r8d,0x3
+ vmovups xmm8,XMMWORD PTR [rsi]
+ vmovups xmm9,XMMWORD PTR [rsi+0x10]
+ vshufps xmm4,xmm8,xmm9,0x88
+ vshufps xmm5,xmm8,xmm9,0xdd
+ vmovups xmm8,XMMWORD PTR [rsi+0x20]
+ vmovups xmm9,XMMWORD PTR [rsi+0x30]
+ vshufps xmm6,xmm8,xmm9,0x88
+ vshufps xmm7,xmm8,xmm9,0xdd
+ vpshufd xmm6,xmm6,0x93
+ vpshufd xmm7,xmm7,0x93
+ mov al,0x7
+3:
+ vpaddd xmm0,xmm0,xmm4
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x10
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0xc
+ vpaddd xmm0,xmm0,xmm5
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x8
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0x7
+ vpshufd xmm0,xmm0,0x93
+ vpshufd xmm3,xmm3,0x4e
+ vpshufd xmm2,xmm2,0x39
+ vpaddd xmm0,xmm0,xmm6
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x10
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0xc
+ vpaddd xmm0,xmm0,xmm7
+ vpaddd xmm0,xmm0,xmm1
+ vpxord xmm3,xmm3,xmm0
+ vprord xmm3,xmm3,0x8
+ vpaddd xmm2,xmm2,xmm3
+ vpxord xmm1,xmm1,xmm2
+ vprord xmm1,xmm1,0x7
+ vpshufd xmm0,xmm0,0x39
+ vpshufd xmm3,xmm3,0x4e
+ vpshufd xmm2,xmm2,0x93
+ dec al
+ je 3f
+ vshufps xmm8,xmm4,xmm5,0xd6
+ vpshufd xmm9,xmm4,0xf
+ vpshufd xmm4,xmm8,0x39
+ vshufps xmm8,xmm6,xmm7,0xfa
+ vpblendd xmm9,xmm9,xmm8,0xaa
+ vpunpcklqdq xmm8,xmm7,xmm5
+ vpblendd xmm8,xmm8,xmm6,0x88
+ vpshufd xmm8,xmm8,0x78
+ vpunpckhdq xmm5,xmm5,xmm7
+ vpunpckldq xmm6,xmm6,xmm5
+ vpshufd xmm7,xmm6,0x1e
+ vmovdqa xmm5,xmm9
+ vmovdqa xmm6,xmm8
+ jmp 3b
+3:
+ vpxor xmm0,xmm0,xmm2
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm2,xmm2,XMMWORD PTR [rdi]
+ vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10]
+ vmovdqu XMMWORD PTR [r9],xmm0
+ vmovdqu XMMWORD PTR [r9+0x10],xmm1
+ vmovdqu XMMWORD PTR [r9+0x20],xmm2
+ vmovdqu XMMWORD PTR [r9+0x30],xmm3
+ jmp 9b
+
+
#ifdef __APPLE__
.static_data
#else
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
index 53c586141fbef..9419f4b5aae56 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
@@ -2589,7 +2589,7 @@ blake3_compress_xof_avx512:
add rsp, 72
ret
-.section .rodata
+.section .rdata
.p2align 6
INDEX0:
.long 0, 1, 2, 3, 16, 17, 18, 19
diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
index e96e714225f41..d00580fe35195 100644
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@@ -4,16 +4,46 @@
#include "blake3_impl.h"
+#if defined(_MSC_VER)
+#include <Windows.h>
+#endif
+
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
-#error "Unimplemented!"
+#undef IS_X86 /* Unimplemented! */
#endif
#endif
+#if !defined(BLAKE3_ATOMICS)
+#if defined(__has_include)
+#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
+#define BLAKE3_ATOMICS 1
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* defined(__has_include) */
+#endif /* BLAKE3_ATOMICS */
+
+#if BLAKE3_ATOMICS
+#define ATOMIC_INT _Atomic int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#elif defined(_MSC_VER)
+#define ATOMIC_INT LONG
+#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
+#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
+#else
+#define ATOMIC_INT int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#endif
+
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
@@ -76,7 +106,7 @@ enum cpu_feature {
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
- enum cpu_feature g_cpu_features = UNDEFINED;
+ ATOMIC_INT g_cpu_features = UNDEFINED;
LLVM_ATTRIBUTE_USED
#if !defined(BLAKE3_TESTING)
@@ -85,14 +115,16 @@ static
enum cpu_feature
get_cpu_features(void) {
- if (g_cpu_features != UNDEFINED) {
- return g_cpu_features;
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
+ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
+ if (features != UNDEFINED) {
+ return features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
- enum cpu_feature features = 0;
+ features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
@@ -102,7 +134,7 @@ static
if (*edx & (1UL << 26))
features |= SSE2;
#endif
- if (*ecx & (1UL << 0))
+ if (*ecx & (1UL << 9))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
@@ -125,7 +157,7 @@ static
}
}
}
- g_cpu_features = features;
+ ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */
@@ -192,6 +224,30 @@ void blake3_compress_xof(const uint32_t cv[8],
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
+
+void blake3_xof_many(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64], size_t outblocks) {
+ if (outblocks == 0) {
+ // The current assembly implementation always outputs at least 1 block.
+ return;
+ }
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) {
+ blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
+ return;
+ }
+#endif
+#endif
+ for(size_t i = 0; i < outblocks; ++i) {
+ blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
+ }
+}
+
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h
index c679ecde4c4e9..deed079e468a5 100644
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@@ -13,6 +13,8 @@
#include "llvm_blake3_prefix.h"
+#define BLAKE3_PRIVATE
+
// internal flags
enum blake3_flags {
CHUNK_START = 1 << 0,
@@ -32,7 +34,7 @@ enum blake3_flags {
#define INLINE static inline __attribute__((always_inline))
#endif
-#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif
@@ -42,7 +44,7 @@ enum blake3_flags {
#define IS_X86_32
#endif
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif
@@ -54,10 +56,13 @@ enum blake3_flags {
#endif
#if !defined(BLAKE3_USE_NEON)
- // If BLAKE3_USE_NEON not manually set, autodetect based on
- // AArch64ness and endianness.
- #if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
- #define BLAKE3_USE_NEON 1
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
+ #if defined(IS_AARCH64)
+ #if defined(__ARM_BIG_ENDIAN)
+ #define BLAKE3_USE_NEON 0
+ #else
+ #define BLAKE3_USE_NEON 1
+ #endif
#else
#define BLAKE3_USE_NEON 0
#endif
@@ -93,7 +98,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
/* x is assumed to be nonzero. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
- return 63 ^ __builtin_clzll(x);
+ return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
unsigned long index;
_BitScanReverse64(&index, x);
@@ -123,7 +128,7 @@ static unsigned int highest_one(uint64_t x) {
// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
- return __builtin_popcountll(x);
+ return (unsigned int)__builtin_popcountll(x);
#else
unsigned int count = 0;
while (x != 0) {
@@ -164,6 +169,13 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
key_words[7] = load32(&key[7 * 4]);
}
+INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint32_t block_words[16]) {
+ for (size_t i = 0; i < 16; i++) {
+ block_words[i] = load32(&block[i * 4]);
+ }
+}
+
INLINE void store32(void *dst, uint32_t w) {
uint8_t *p = (uint8_t *)dst;
p[0] = (uint8_t)(w >> 0);
@@ -195,6 +207,12 @@ void blake3_compress_xof(const uint32_t cv[8],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]);
+LLVM_LIBRARY_VISIBILITY
+void blake3_xof_many(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64], size_t outblocks);
+
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
@@ -204,6 +222,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
LLVM_LIBRARY_VISIBILITY
size_t blake3_simd_degree(void);
+BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
+ const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags,
+ uint8_t *out, bool use_tbb);
+
+#if defined(BLAKE3_USE_TBB)
+BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
+ // shared params
+ const uint32_t key[8], uint8_t flags, bool use_tbb,
+ // left-hand side params
+ const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
+ uint8_t *l_cvs, size_t *l_n,
+ // right-hand side params
+ const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
+ uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
+#endif
// Declarations for implementation-specific functions.
LLVM_LIBRARY_VISIBILITY
@@ -289,6 +323,14 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
+
+#if !defined(_WIN32)
+LLVM_LIBRARY_VISIBILITY
+void blake3_xof_many_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t* out, size_t outblocks);
+#endif
#endif
#endif
diff --git a/llvm/lib/Support/BLAKE3/blake3_neon.c b/llvm/lib/Support/BLAKE3/blake3_neon.c
index 380bbfc3e4665..9629e10836864 100644
--- a/llvm/lib/Support/BLAKE3/blake3_neon.c
+++ b/llvm/lib/Support/BLAKE3/blake3_neon.c
@@ -12,14 +12,12 @@
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
- uint32x4_t x;
- memcpy(&x, src, 16);
- return x;
+ return vreinterpretq_u32_u8(vld1q_u8(src));
}
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
- memcpy(dest, &src, 16);
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@@ -38,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+ // The straightforward implementation would be two shifts and an or, but that's
+ // slower on microarchitectures we've tested. See
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+ return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+#if defined(__clang__)
+ return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
+ static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+ return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else
+ return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
+#endif
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+ return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
}
// TODO: compress_neon
@@ -230,7 +245,6 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
-static
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2.c b/llvm/lib/Support/BLAKE3/blake3_sse2.c
index f4449ac0b3cd8..691e1c6806cce 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse2.c
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2.c
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
index bf3b4523a9f1e..3a7163637f90c 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
@@ -2303,7 +2303,7 @@ blake3_compress_xof_sse2:
ret
-.section .rodata
+.section .rdata
.p2align 6
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41.c b/llvm/lib/Support/BLAKE3/blake3_sse41.c
index 87a8dae15ce9a..4653a856fe6c4 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse41.c
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41.c
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
index 28bdf3890a29f..b39180febf888 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
@@ -2044,7 +2044,7 @@ blake3_compress_xof_sse41:
ret
-.section .rodata
+.section .rdata
.p2align 6
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
More information about the llvm-commits
mailing list