[llvm] Perform bitreverse using AVX512 GFNI for i32 and i64. (PR #81764)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 14 09:11:56 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: None (shamithoke)
<details>
<summary>Changes</summary>
Currently, the lowering operation for bitreverse using Intel AVX512 GFNI only supports byte vectors.
This patch extends the operation to i32 and i64 (both scalar and vector types).
---
Patch is 22.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81764.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+53-3)
- (modified) llvm/test/CodeGen/X86/bitreverse.ll (+54-227)
- (modified) llvm/test/CodeGen/X86/vector-bitreverse.ll (+106-44)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index db5e4fe84f410a..2ee00445f1c496 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1845,6 +1845,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
+ if (Subtarget.hasGFNI())
+ for (auto VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::i64,
+ MVT::v2i64, MVT::v4i64, MVT::v8i64})
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
@@ -31040,17 +31044,63 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
return DAG.getBitcast(VT, Res);
}
+static auto createBSWAPShuffleMask(EVT VT) {
+ SmallVector<int, 16> ShuffleMask;
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+ for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+ for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+ ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+
+ return ShuffleMask;
+}
+
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ auto HasGFNI = Subtarget.hasGFNI();
+ auto ScalarType = VT.getScalarType();
+
+ if (HasGFNI && ((ScalarType == MVT::i32) || (ScalarType == MVT::i64))) {
+ if (VT.isVector()) {
+ SmallVector<int, 16> BSWAPMask = createBSWAPShuffleMask(VT);
+ EVT ByteVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
+ SDValue VecShuffle = DAG.getVectorShuffle(
+ ByteVT, DL, DAG.getNode(ISD::BITCAST, DL, ByteVT, In),
+ DAG.getUNDEF(ByteVT), BSWAPMask);
+ SDValue BitReverse = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, VecShuffle);
+ return DAG.getBitcast(VT, BitReverse);
+ } else {
+ auto CastTo = ScalarType == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+ SDValue ScalarToVector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CastTo, In);
+ SDValue BitReverse =
+ DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
+ DAG.getBitcast(MVT::v16i8, ScalarToVector));
+ SDValue ExtractElementZero = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, ScalarType,
+ DAG.getBitcast(CastTo, BitReverse), DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::BSWAP, DL, ScalarType, ExtractElementZero);
+ }
+ }
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
- SDValue In = Op.getOperand(0);
- SDLoc DL(Op);
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+
+ if (Subtarget.hasXOP() && !VT.is512BitVector())
+ return LowerBITREVERSE_XOP(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
@@ -31066,7 +31116,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
- if (Subtarget.hasGFNI()) {
+ if (HasGFNI) {
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
Matrix = DAG.getBitcast(VT, Matrix);
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 9daac1df1d9750..7ad1f9ce29e610 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -172,26 +172,10 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
;
; GFNI-LABEL: test_bitreverse_i64:
; GFNI: # %bb.0:
-; GFNI-NEXT: bswapq %rdi
-; GFNI-NEXT: movq %rdi, %rax
-; GFNI-NEXT: shrq $4, %rax
-; GFNI-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNI-NEXT: andq %rcx, %rax
-; GFNI-NEXT: andq %rcx, %rdi
-; GFNI-NEXT: shlq $4, %rdi
-; GFNI-NEXT: orq %rax, %rdi
-; GFNI-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNI-NEXT: movq %rdi, %rcx
-; GFNI-NEXT: andq %rax, %rcx
-; GFNI-NEXT: shrq $2, %rdi
-; GFNI-NEXT: andq %rax, %rdi
-; GFNI-NEXT: leaq (%rdi,%rcx,4), %rax
-; GFNI-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; GFNI-NEXT: movq %rax, %rdx
-; GFNI-NEXT: andq %rcx, %rdx
-; GFNI-NEXT: shrq %rax
-; GFNI-NEXT: andq %rcx, %rax
-; GFNI-NEXT: leaq (%rax,%rdx,2), %rax
+; GFNI-NEXT: vmovq %rdi, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vmovq %xmm0, %rax
+; GFNI-NEXT: bswapq %rax
; GFNI-NEXT: retq
%b = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %b
@@ -253,24 +237,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
;
; GFNI-LABEL: test_bitreverse_i32:
; GFNI: # %bb.0:
-; GFNI-NEXT: # kill: def $edi killed $edi def $rdi
-; GFNI-NEXT: bswapl %edi
-; GFNI-NEXT: movl %edi, %eax
-; GFNI-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNI-NEXT: shll $4, %eax
-; GFNI-NEXT: shrl $4, %edi
-; GFNI-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNI-NEXT: orl %eax, %edi
-; GFNI-NEXT: movl %edi, %eax
-; GFNI-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNI-NEXT: shrl $2, %edi
-; GFNI-NEXT: andl $858993459, %edi # imm = 0x33333333
-; GFNI-NEXT: leal (%rdi,%rax,4), %eax
-; GFNI-NEXT: movl %eax, %ecx
-; GFNI-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNI-NEXT: shrl %eax
-; GFNI-NEXT: andl $1431655765, %eax # imm = 0x55555555
-; GFNI-NEXT: leal (%rax,%rcx,2), %eax
+; GFNI-NEXT: vmovd %edi, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vmovd %xmm0, %eax
+; GFNI-NEXT: bswapl %eax
; GFNI-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %b
@@ -335,24 +305,10 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
;
; GFNI-LABEL: test_bitreverse_i24:
; GFNI: # %bb.0:
-; GFNI-NEXT: # kill: def $edi killed $edi def $rdi
-; GFNI-NEXT: bswapl %edi
-; GFNI-NEXT: movl %edi, %eax
-; GFNI-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNI-NEXT: shll $4, %eax
-; GFNI-NEXT: shrl $4, %edi
-; GFNI-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNI-NEXT: orl %eax, %edi
-; GFNI-NEXT: movl %edi, %eax
-; GFNI-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNI-NEXT: shrl $2, %edi
-; GFNI-NEXT: andl $858993459, %edi # imm = 0x33333333
-; GFNI-NEXT: leal (%rdi,%rax,4), %eax
-; GFNI-NEXT: movl %eax, %ecx
-; GFNI-NEXT: andl $1431655680, %ecx # imm = 0x55555500
-; GFNI-NEXT: shrl %eax
-; GFNI-NEXT: andl $1431655680, %eax # imm = 0x55555500
-; GFNI-NEXT: leal (%rax,%rcx,2), %eax
+; GFNI-NEXT: vmovd %edi, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vmovd %xmm0, %eax
+; GFNI-NEXT: bswapl %eax
; GFNI-NEXT: shrl $8, %eax
; GFNI-NEXT: retq
%b = call i24 @llvm.bitreverse.i24(i24 %a)
@@ -1412,196 +1368,67 @@ define i528 @large_promotion(i528 %A) nounwind {
;
; GFNI-LABEL: large_promotion:
; GFNI: # %bb.0:
-; GFNI-NEXT: pushq %r15
; GFNI-NEXT: pushq %r14
-; GFNI-NEXT: pushq %r13
-; GFNI-NEXT: pushq %r12
; GFNI-NEXT: pushq %rbx
; GFNI-NEXT: movq %rdi, %rax
-; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; GFNI-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9241421688590303745,9241421688590303745]
+; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %r10
+; GFNI-NEXT: bswapq %r10
+; GFNI-NEXT: vmovq %r9, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %rdi
; GFNI-NEXT: bswapq %rdi
-; GFNI-NEXT: movq %rdi, %r10
-; GFNI-NEXT: shrq $4, %r10
-; GFNI-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
-; GFNI-NEXT: andq %r11, %r10
-; GFNI-NEXT: andq %r11, %rdi
-; GFNI-NEXT: shlq $4, %rdi
-; GFNI-NEXT: orq %r10, %rdi
-; GFNI-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
-; GFNI-NEXT: movq %rdi, %r14
-; GFNI-NEXT: andq %r10, %r14
-; GFNI-NEXT: shrq $2, %rdi
-; GFNI-NEXT: andq %r10, %rdi
-; GFNI-NEXT: leaq (%rdi,%r14,4), %rdi
-; GFNI-NEXT: movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000
-; GFNI-NEXT: movq %rdi, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %r14, %rdi
-; GFNI-NEXT: leaq (%rdi,%r13,2), %rdi
-; GFNI-NEXT: bswapq %rbx
-; GFNI-NEXT: movq %rbx, %r14
-; GFNI-NEXT: shrq $4, %r14
-; GFNI-NEXT: andq %r11, %r14
-; GFNI-NEXT: andq %r11, %rbx
-; GFNI-NEXT: shlq $4, %rbx
-; GFNI-NEXT: orq %r14, %rbx
-; GFNI-NEXT: movq %rbx, %r14
-; GFNI-NEXT: andq %r10, %r14
-; GFNI-NEXT: shrq $2, %rbx
-; GFNI-NEXT: andq %r10, %rbx
-; GFNI-NEXT: leaq (%rbx,%r14,4), %rbx
-; GFNI-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
-; GFNI-NEXT: movq %rbx, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %rbx
-; GFNI-NEXT: andq %r14, %rbx
-; GFNI-NEXT: leaq (%rbx,%r13,2), %rbx
-; GFNI-NEXT: shrdq $48, %rbx, %rdi
-; GFNI-NEXT: bswapq %r15
-; GFNI-NEXT: movq %r15, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %r15
-; GFNI-NEXT: shlq $4, %r15
-; GFNI-NEXT: orq %r13, %r15
-; GFNI-NEXT: movq %r15, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %r15
-; GFNI-NEXT: andq %r10, %r15
-; GFNI-NEXT: leaq (%r15,%r13,4), %r15
-; GFNI-NEXT: movq %r15, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %r15
-; GFNI-NEXT: andq %r14, %r15
-; GFNI-NEXT: leaq (%r15,%r13,2), %r15
-; GFNI-NEXT: shrdq $48, %r15, %rbx
-; GFNI-NEXT: bswapq %r12
-; GFNI-NEXT: movq %r12, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %r12
-; GFNI-NEXT: shlq $4, %r12
-; GFNI-NEXT: orq %r13, %r12
-; GFNI-NEXT: movq %r12, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %r12
-; GFNI-NEXT: andq %r10, %r12
-; GFNI-NEXT: leaq (%r12,%r13,4), %r12
-; GFNI-NEXT: movq %r12, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %r12
-; GFNI-NEXT: andq %r14, %r12
-; GFNI-NEXT: leaq (%r12,%r13,2), %r12
-; GFNI-NEXT: shrdq $48, %r12, %r15
-; GFNI-NEXT: bswapq %r9
-; GFNI-NEXT: movq %r9, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %r9
-; GFNI-NEXT: shlq $4, %r9
-; GFNI-NEXT: orq %r13, %r9
-; GFNI-NEXT: movq %r9, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %r9
-; GFNI-NEXT: andq %r10, %r9
-; GFNI-NEXT: leaq (%r9,%r13,4), %r9
-; GFNI-NEXT: movq %r9, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %r9
-; GFNI-NEXT: andq %r14, %r9
-; GFNI-NEXT: leaq (%r9,%r13,2), %r9
-; GFNI-NEXT: shrdq $48, %r9, %r12
+; GFNI-NEXT: vmovq %r8, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %r8
; GFNI-NEXT: bswapq %r8
-; GFNI-NEXT: movq %r8, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %r8
-; GFNI-NEXT: shlq $4, %r8
-; GFNI-NEXT: orq %r13, %r8
-; GFNI-NEXT: movq %r8, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %r8
-; GFNI-NEXT: andq %r10, %r8
-; GFNI-NEXT: leaq (%r8,%r13,4), %r8
-; GFNI-NEXT: movq %r8, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %r8
-; GFNI-NEXT: andq %r14, %r8
-; GFNI-NEXT: leaq (%r8,%r13,2), %r8
-; GFNI-NEXT: shrdq $48, %r8, %r9
+; GFNI-NEXT: movq %r8, %r9
+; GFNI-NEXT: shldq $16, %rdi, %r9
+; GFNI-NEXT: shldq $16, %r10, %rdi
+; GFNI-NEXT: vmovq %rcx, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %rcx
; GFNI-NEXT: bswapq %rcx
-; GFNI-NEXT: movq %rcx, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %rcx
-; GFNI-NEXT: shlq $4, %rcx
-; GFNI-NEXT: orq %r13, %rcx
-; GFNI-NEXT: movq %rcx, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %rcx
-; GFNI-NEXT: andq %r10, %rcx
-; GFNI-NEXT: leaq (%rcx,%r13,4), %rcx
-; GFNI-NEXT: movq %rcx, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %rcx
-; GFNI-NEXT: andq %r14, %rcx
-; GFNI-NEXT: leaq (%rcx,%r13,2), %rcx
; GFNI-NEXT: shrdq $48, %rcx, %r8
+; GFNI-NEXT: vmovq %rdx, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %rdx
; GFNI-NEXT: bswapq %rdx
-; GFNI-NEXT: movq %rdx, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %rdx
-; GFNI-NEXT: shlq $4, %rdx
-; GFNI-NEXT: orq %r13, %rdx
-; GFNI-NEXT: movq %rdx, %r13
-; GFNI-NEXT: andq %r10, %r13
-; GFNI-NEXT: shrq $2, %rdx
-; GFNI-NEXT: andq %r10, %rdx
-; GFNI-NEXT: leaq (%rdx,%r13,4), %rdx
-; GFNI-NEXT: movq %rdx, %r13
-; GFNI-NEXT: andq %r14, %r13
-; GFNI-NEXT: shrq %rdx
-; GFNI-NEXT: andq %r14, %rdx
-; GFNI-NEXT: leaq (%rdx,%r13,2), %rdx
; GFNI-NEXT: shrdq $48, %rdx, %rcx
+; GFNI-NEXT: vmovq %rsi, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %rsi
; GFNI-NEXT: bswapq %rsi
-; GFNI-NEXT: movq %rsi, %r13
-; GFNI-NEXT: shrq $4, %r13
-; GFNI-NEXT: andq %r11, %r13
-; GFNI-NEXT: andq %r11, %rsi
-; GFNI-NEXT: shlq $4, %rsi
-; GFNI-NEXT: orq %r13, %rsi
-; GFNI-NEXT: movq %rsi, %r11
-; GFNI-NEXT: andq %r10, %r11
-; GFNI-NEXT: shrq $2, %rsi
-; GFNI-NEXT: andq %r10, %rsi
-; GFNI-NEXT: leaq (%rsi,%r11,4), %rsi
-; GFNI-NEXT: movq %rsi, %r10
-; GFNI-NEXT: andq %r14, %r10
-; GFNI-NEXT: shrq %rsi
-; GFNI-NEXT: andq %r14, %rsi
-; GFNI-NEXT: leaq (%rsi,%r10,2), %rsi
; GFNI-NEXT: shrdq $48, %rsi, %rdx
+; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %r11
+; GFNI-NEXT: bswapq %r11
+; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vmovq %xmm1, %rbx
+; GFNI-NEXT: bswapq %rbx
+; GFNI-NEXT: shrdq $48, %rbx, %r11
+; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm0
+; GFNI-NEXT: vmovq %xmm0, %r14
+; GFNI-NEXT: bswapq %r14
+; GFNI-NEXT: shrdq $48, %r14, %rbx
+; GFNI-NEXT: shrdq $48, %r10, %r14
; GFNI-NEXT: shrq $48, %rsi
+; GFNI-NEXT: movq %r14, 16(%rax)
+; GFNI-NEXT: movq %rbx, 8(%rax)
+; GFNI-NEXT: movq %r11, (%rax)
; GFNI-NEXT: movq %rdx, 56(%rax)
; GFNI-NEXT: movq %rcx, 48(%rax)
; GFNI-NEXT: movq %r8, 40(%rax)
; GFNI-NEXT: movq %r9, 32(%rax)
-; GFNI-NEXT: movq %r12, 24(%rax)
-; GFNI-NEXT: movq %r15, 16(%rax)
-; GFNI-NEXT: movq %rbx, 8(%rax)
-; GFNI-NEXT: movq %rdi, (%rax)
+; GFNI-NEXT: movq %rdi, 24(%rax)
; GFNI-NEXT: movw %si, 64(%rax)
; GFNI-NEXT: popq %rbx
-; GFNI-NEXT: popq %r12
-; GFNI-NEXT: popq %r13
; GFNI-NEXT: popq %r14
-; GFNI-NEXT: popq %r15
; GFNI-NEXT: retq
%Z = call i528 @llvm.bitreverse.i528(i528 %A)
ret i528 %Z
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index d3f357cd179525..6354477abccfce 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -274,27 +274,57 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i32:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
-; GFNIAVX-NEXT: bswapl %edi
-; GFNIAVX-NEXT: movl %edi, %eax
-; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNIAVX-NEXT: shll $4, %eax
-; GFNIAVX-NEXT: shrl $4, %edi
-; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNIAVX-NEXT: orl %eax, %edi
-; GFNIAVX-NEXT: movl %edi, %eax
-; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX-NEXT: shrl $2, %edi
-; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333
-; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
-; GFNIAVX-NEXT: movl %eax, %ecx
-; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX-NEXT: shrl %eax
-; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
-; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
-; GFNIAVX-NEXT: retq
+; GFNIAVX1-LABEL: test_bitreverse_i32:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; GFNIAVX1-NEXT: bswapl %edi
+; GFNIAVX1-NEXT: movl %edi, %eax
+; GFNIAVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX1-NEXT: shll $4, %eax
+; GFNIAVX1-NEXT: shrl $4, %edi
+; GFNIAVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; GFNIAVX1-NEXT: orl %eax, %edi
+; GFNIAVX1-NEXT: movl %edi, %eax
+; GFNIAVX1-NEXT: andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX1-NEXT: shrl $2, %edi
+; GFNIAVX1-NEXT: andl $858993459, %edi # imm = 0x33333333
+; GFNIAVX1-NEXT: leal (%rdi,%rax,4), %eax
+; GFNIAVX1-NEXT: movl %eax, %ecx
+; GFNIAVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX1-NEXT: shrl %eax
+; GFNIAVX1-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; GFNIAVX1-NEXT: leal (%rax,%rcx,2), %eax
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i32:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi
+; GFNIAVX2-NEXT: bswapl %edi
+; GFNIAVX2-NEXT: movl %edi, %eax
+; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX2-NEXT: shll $4, %eax
+; GFNIAVX2-NEXT: shrl $4, %edi
+; GFNIAVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; GFNIAVX2-NEXT: orl %eax, %edi
+; GFNIAVX2-NEXT: movl %edi, %eax
+; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX2-NEXT: shrl $2, %edi
+; GFNIAVX2-NEXT: andl $858993459, %edi # imm = 0x33333333
+; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax
+; GFNIAVX2-NEXT: movl %eax, %ecx
+; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX2-NEXT: shrl %eax
+; GFNIAVX2-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i32:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vmovd %edi, %xmm0
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512-NEXT: vmovd %xmm0, %eax
+; GFNIAVX512-NEXT: bswapl %eax
+; GFNIAVX512-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %b
}
@@ -379,29 +409,61 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i64:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: bswapq %rdi
-; GFNIAVX-NEXT: movq %rdi, %rax
-; GFNIAVX-NEXT: shrq $4, %rax
-; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX-NEXT: andq %rcx, %rax
-; GFNIAVX-NEXT: andq %rcx, %rdi
-; GFNIAVX-NEXT: shlq $4, %rdi
-; GFNIAVX-NEXT: orq %rax, %rdi
-; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNIAVX-NEXT: movq %rdi, %rcx
-; GFNIAVX-NEXT: andq %rax, %rcx
-; GFNIAVX-NEXT: shrq $2, %rdi
-; GFNIAVX-NEXT: andq %rax, %rdi
-; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax
-; GFNIAVX-NEXT: movabsq $6148...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/81764
More information about the llvm-commits
mailing list