[llvm] [x32] Fix BLAKE3 assembly (PR #149617)
Harald van Dijk via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 19 03:53:49 PDT 2025
https://github.com/hvdijk updated https://github.com/llvm/llvm-project/pull/149617
>From db270fb27d9d2461f9f80ee29c165839f4d09103 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Sat, 19 Jul 2025 00:47:46 +0100
Subject: [PATCH] [x32] Fix BLAKE3 assembly
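The x86-64 assembly implementations of BLAKE3 are used in both 64-bit
and 32-bit pointer mode (x32, where _ILP32 is defined), but they only
worked correctly in 64-bit pointer mode. This PR makes them work in
32-bit pointer mode as well: under _ILP32 the arguments in esi and edx
are zero-extended to 64 bits, and the entries of the input pointer
array are loaded and stepped over as 4-byte rather than 8-byte values.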
---
.../Support/BLAKE3/blake3_avx2_x86-64_unix.S | 43 +++++++++
.../BLAKE3/blake3_avx512_x86-64_unix.S | 91 +++++++++++++++++++
.../Support/BLAKE3/blake3_sse2_x86-64_unix.S | 28 ++++++
.../Support/BLAKE3/blake3_sse41_x86-64_unix.S | 28 ++++++
4 files changed, 190 insertions(+)
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
index e98893c7ef8b8..f285fe119f4c1 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
@@ -45,6 +45,10 @@ blake3_hash_many_avx2:
mov rbp, rsp
sub rsp, 680
and rsp, 0xFFFFFFFFFFFFFFC0
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
neg r9d
vmovd xmm0, r9d
vpbroadcastd ymm0, xmm0
@@ -77,6 +81,7 @@ blake3_hash_many_avx2:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -85,6 +90,16 @@ blake3_hash_many_avx2:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -1305,7 +1320,11 @@ blake3_hash_many_avx2:
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
@@ -1346,10 +1365,17 @@ blake3_hash_many_avx2:
vpblendd ymm15, ymm15, ymm12, 0x44
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1557,7 +1583,11 @@ blake3_hash_many_avx2:
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test rsi, 0x2
@@ -1573,8 +1603,13 @@ blake3_hash_many_avx2:
vinserti128 ymm13, ymm13, xmm14, 0x01
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1683,7 +1718,11 @@ blake3_hash_many_avx2:
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test rsi, 0x1
@@ -1695,7 +1734,11 @@ blake3_hash_many_avx2:
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
index b4b14946de10e..709c4752d4084 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
@@ -59,6 +59,10 @@ blake3_hash_many_avx512:
sub rsp, 144
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
kmovw k1, r9d
vmovd xmm0, r8d
vpbroadcastd ymm0, xmm0
@@ -107,6 +111,7 @@ blake3_hash_many_avx512:
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -115,6 +120,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -127,6 +142,7 @@ blake3_hash_many_avx512:
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -135,6 +151,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -169,6 +195,7 @@ blake3_hash_many_avx512:
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -177,6 +204,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -197,6 +234,7 @@ blake3_hash_many_avx512:
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -205,6 +243,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -1095,7 +1143,11 @@ blake3_hash_many_avx512:
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+#ifndef _ILP32
add rdi, 128
+#else
+ add rdi, 64
+#endif
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
@@ -1125,6 +1177,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -1133,6 +1186,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -2055,7 +2118,11 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
add rbx, 256
mov qword ptr [rbp+0x50], rbx
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
sub rsi, 8
3:
mov rbx, qword ptr [rbp+0x50]
@@ -2078,10 +2145,17 @@ blake3_hash_many_avx512:
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
mov eax, 43690
kmovw k3, eax
mov eax, 34952
@@ -2195,7 +2269,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test esi, 0x2
@@ -2209,8 +2287,13 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -2308,7 +2391,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test esi, 0x1
@@ -2319,7 +2406,11 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
index d69a1706fefe7..85434df927cdd 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
@@ -54,6 +54,10 @@ blake3_hash_many_sse2:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -91,10 +95,17 @@ blake3_hash_many_sse2:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1648,7 +1659,11 @@ blake3_hash_many_sse2:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1679,8 +1694,13 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1909,7 +1929,11 @@ blake3_hash_many_sse2:
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1920,7 +1944,11 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
index c5b103af61c4f..403773421587c 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
@@ -54,6 +54,10 @@ blake3_hash_many_sse41:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -91,10 +95,17 @@ blake3_hash_many_sse41:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1452,7 +1463,11 @@ blake3_hash_many_sse41:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1483,8 +1498,13 @@ blake3_hash_many_sse41:
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1686,7 +1706,11 @@ blake3_hash_many_sse41:
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1699,7 +1723,11 @@ blake3_hash_many_sse41:
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
More information about the llvm-commits
mailing list