[llvm] Perform bitreverse using AVX512 GFNI for i32 and i64. (PR #81764)

Wed Apr 10 10:41:57 PDT 2024

https://github.com/shamithoke updated https://github.com/llvm/llvm-project/pull/81764

>From 9bcb9c21dd9d5607a7a466d4ad36d6edd9a932d3 Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Wed, 14 Feb 2024 22:38:14 +0530
Subject: [PATCH 1/6] Perform bitreverse using AVX512 GFNI for i32 and i64.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |  52 +++-
 llvm/test/CodeGen/X86/bitreverse.ll        | 281 ++++-----------------
 llvm/test/CodeGen/X86/vector-bitreverse.ll | 150 +++++++----
 3 files changed, 209 insertions(+), 274 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 010f9c30ab4033..35370762fe701c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31312,17 +31312,63 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
   return DAG.getBitcast(VT, Res);
 }
 
+static auto createBSWAPShuffleMask(EVT VT) {
+  SmallVector<int, 16> ShuffleMask;
+  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+  for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+    for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+      ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+
+  return ShuffleMask;
+} 
+
 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  auto HasGFNI = Subtarget.hasGFNI();
+  auto ScalarType = VT.getScalarType();
+
+  if (HasGFNI && ((ScalarType == MVT::i32) || (ScalarType == MVT::i64))) {
+    if (VT.isVector()) {
+      SmallVector<int, 16> BSWAPMask = createBSWAPShuffleMask(VT);
+      EVT ByteVT =
+          EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
+      SDValue VecShuffle = DAG.getVectorShuffle(
+          ByteVT, DL, DAG.getNode(ISD::BITCAST, DL, ByteVT, In),
+          DAG.getUNDEF(ByteVT), BSWAPMask);
+      SDValue BitReverse = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, VecShuffle);
+      return DAG.getBitcast(VT, BitReverse);
+    } else {
+      auto CastTo = ScalarType == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+      SDValue ScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CastTo, In);
+      SDValue BitReverse =
+          DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
+                      DAG.getBitcast(MVT::v16i8, ScalarToVector));
+      SDValue ExtractElementZero = DAG.getNode(
+          ISD::EXTRACT_VECTOR_ELT, DL, ScalarType,
+          DAG.getBitcast(CastTo, BitReverse), DAG.getIntPtrConstant(0, DL));
+      return DAG.getNode(ISD::BSWAP, DL, ScalarType, ExtractElementZero);
+    }
+  }
 
   if (Subtarget.hasXOP() && !VT.is512BitVector())
     return LowerBITREVERSE_XOP(Op, DAG);
 
   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
 
-  SDValue In = Op.getOperand(0);
-  SDLoc DL(Op);
+  assert(VT.getScalarType() == MVT::i8 &&
+         "Only byte vector BITREVERSE supported");
+
+  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+
+  if (Subtarget.hasXOP() && !VT.is512BitVector())
+    return LowerBITREVERSE_XOP(Op, DAG);
+
+  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
 
   // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
   if (VT.is512BitVector() && !Subtarget.hasBWI())
@@ -31346,7 +31392,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   unsigned NumElts = VT.getVectorNumElements();
 
   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
-  if (Subtarget.hasGFNI()) {
+  if (HasGFNI) {
     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
     Matrix = DAG.getBitcast(VT, Matrix);
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 26b1d64874e590..704563ab1bbf70 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -172,26 +172,10 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ;
 ; GFNI-LABEL: test_bitreverse_i64:
 ; GFNI:       # %bb.0:
-; GFNI-NEXT:    bswapq %rdi
-; GFNI-NEXT:    movq %rdi, %rax
-; GFNI-NEXT:    shrq $4, %rax
-; GFNI-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNI-NEXT:    andq %rcx, %rax
-; GFNI-NEXT:    andq %rcx, %rdi
-; GFNI-NEXT:    shlq $4, %rdi
-; GFNI-NEXT:    orq %rax, %rdi
-; GFNI-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNI-NEXT:    movq %rdi, %rcx
-; GFNI-NEXT:    andq %rax, %rcx
-; GFNI-NEXT:    shrq $2, %rdi
-; GFNI-NEXT:    andq %rax, %rdi
-; GFNI-NEXT:    leaq (%rdi,%rcx,4), %rax
-; GFNI-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; GFNI-NEXT:    movq %rax, %rdx
-; GFNI-NEXT:    andq %rcx, %rdx
-; GFNI-NEXT:    shrq %rax
-; GFNI-NEXT:    andq %rcx, %rax
-; GFNI-NEXT:    leaq (%rax,%rdx,2), %rax
+; GFNI-NEXT:    vmovq %rdi, %xmm0
+; GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT:    vmovq %xmm0, %rax
+; GFNI-NEXT:    bswapq %rax
 ; GFNI-NEXT:    retq
   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %b
@@ -253,24 +237,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ;
 ; GFNI-LABEL: test_bitreverse_i32:
 ; GFNI:       # %bb.0:
-; GFNI-NEXT:    # kill: def $edi killed $edi def $rdi
-; GFNI-NEXT:    bswapl %edi
-; GFNI-NEXT:    movl %edi, %eax
-; GFNI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNI-NEXT:    shll $4, %eax
-; GFNI-NEXT:    shrl $4, %edi
-; GFNI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNI-NEXT:    orl %eax, %edi
-; GFNI-NEXT:    movl %edi, %eax
-; GFNI-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNI-NEXT:    shrl $2, %edi
-; GFNI-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; GFNI-NEXT:    leal (%rdi,%rax,4), %eax
-; GFNI-NEXT:    movl %eax, %ecx
-; GFNI-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNI-NEXT:    shrl %eax
-; GFNI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; GFNI-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNI-NEXT:    vmovd %edi, %xmm0
+; GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT:    vmovd %xmm0, %eax
+; GFNI-NEXT:    bswapl %eax
 ; GFNI-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %b
@@ -335,24 +305,10 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
 ;
 ; GFNI-LABEL: test_bitreverse_i24:
 ; GFNI:       # %bb.0:
-; GFNI-NEXT:    # kill: def $edi killed $edi def $rdi
-; GFNI-NEXT:    bswapl %edi
-; GFNI-NEXT:    movl %edi, %eax
-; GFNI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNI-NEXT:    shll $4, %eax
-; GFNI-NEXT:    shrl $4, %edi
-; GFNI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNI-NEXT:    orl %eax, %edi
-; GFNI-NEXT:    movl %edi, %eax
-; GFNI-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNI-NEXT:    shrl $2, %edi
-; GFNI-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; GFNI-NEXT:    leal (%rdi,%rax,4), %eax
-; GFNI-NEXT:    movl %eax, %ecx
-; GFNI-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
-; GFNI-NEXT:    shrl %eax
-; GFNI-NEXT:    andl $1431655680, %eax # imm = 0x55555500
-; GFNI-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNI-NEXT:    vmovd %edi, %xmm0
+; GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT:    vmovd %xmm0, %eax
+; GFNI-NEXT:    bswapl %eax
 ; GFNI-NEXT:    shrl $8, %eax
 ; GFNI-NEXT:    retq
   %b = call i24 @llvm.bitreverse.i24(i24 %a)
@@ -1412,196 +1368,67 @@ define i528 @large_promotion(i528 %A) nounwind {
 ;
 ; GFNI-LABEL: large_promotion:
 ; GFNI:       # %bb.0:
-; GFNI-NEXT:    pushq %r15
 ; GFNI-NEXT:    pushq %r14
-; GFNI-NEXT:    pushq %r13
-; GFNI-NEXT:    pushq %r12
 ; GFNI-NEXT:    pushq %rbx
 ; GFNI-NEXT:    movq %rdi, %rax
-; GFNI-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; GFNI-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; GFNI-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; GFNI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; GFNI-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [9241421688590303745,9241421688590303745]
+; GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %r10
+; GFNI-NEXT:    bswapq %r10
+; GFNI-NEXT:    vmovq %r9, %xmm1
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %rdi
 ; GFNI-NEXT:    bswapq %rdi
-; GFNI-NEXT:    movq %rdi, %r10
-; GFNI-NEXT:    shrq $4, %r10
-; GFNI-NEXT:    movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
-; GFNI-NEXT:    andq %r11, %r10
-; GFNI-NEXT:    andq %r11, %rdi
-; GFNI-NEXT:    shlq $4, %rdi
-; GFNI-NEXT:    orq %r10, %rdi
-; GFNI-NEXT:    movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
-; GFNI-NEXT:    movq %rdi, %r14
-; GFNI-NEXT:    andq %r10, %r14
-; GFNI-NEXT:    shrq $2, %rdi
-; GFNI-NEXT:    andq %r10, %rdi
-; GFNI-NEXT:    leaq (%rdi,%r14,4), %rdi
-; GFNI-NEXT:    movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000
-; GFNI-NEXT:    movq %rdi, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %rdi
-; GFNI-NEXT:    andq %r14, %rdi
-; GFNI-NEXT:    leaq (%rdi,%r13,2), %rdi
-; GFNI-NEXT:    bswapq %rbx
-; GFNI-NEXT:    movq %rbx, %r14
-; GFNI-NEXT:    shrq $4, %r14
-; GFNI-NEXT:    andq %r11, %r14
-; GFNI-NEXT:    andq %r11, %rbx
-; GFNI-NEXT:    shlq $4, %rbx
-; GFNI-NEXT:    orq %r14, %rbx
-; GFNI-NEXT:    movq %rbx, %r14
-; GFNI-NEXT:    andq %r10, %r14
-; GFNI-NEXT:    shrq $2, %rbx
-; GFNI-NEXT:    andq %r10, %rbx
-; GFNI-NEXT:    leaq (%rbx,%r14,4), %rbx
-; GFNI-NEXT:    movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
-; GFNI-NEXT:    movq %rbx, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %rbx
-; GFNI-NEXT:    andq %r14, %rbx
-; GFNI-NEXT:    leaq (%rbx,%r13,2), %rbx
-; GFNI-NEXT:    shrdq $48, %rbx, %rdi
-; GFNI-NEXT:    bswapq %r15
-; GFNI-NEXT:    movq %r15, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %r15
-; GFNI-NEXT:    shlq $4, %r15
-; GFNI-NEXT:    orq %r13, %r15
-; GFNI-NEXT:    movq %r15, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %r15
-; GFNI-NEXT:    andq %r10, %r15
-; GFNI-NEXT:    leaq (%r15,%r13,4), %r15
-; GFNI-NEXT:    movq %r15, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %r15
-; GFNI-NEXT:    andq %r14, %r15
-; GFNI-NEXT:    leaq (%r15,%r13,2), %r15
-; GFNI-NEXT:    shrdq $48, %r15, %rbx
-; GFNI-NEXT:    bswapq %r12
-; GFNI-NEXT:    movq %r12, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %r12
-; GFNI-NEXT:    shlq $4, %r12
-; GFNI-NEXT:    orq %r13, %r12
-; GFNI-NEXT:    movq %r12, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %r12
-; GFNI-NEXT:    andq %r10, %r12
-; GFNI-NEXT:    leaq (%r12,%r13,4), %r12
-; GFNI-NEXT:    movq %r12, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %r12
-; GFNI-NEXT:    andq %r14, %r12
-; GFNI-NEXT:    leaq (%r12,%r13,2), %r12
-; GFNI-NEXT:    shrdq $48, %r12, %r15
-; GFNI-NEXT:    bswapq %r9
-; GFNI-NEXT:    movq %r9, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %r9
-; GFNI-NEXT:    shlq $4, %r9
-; GFNI-NEXT:    orq %r13, %r9
-; GFNI-NEXT:    movq %r9, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %r9
-; GFNI-NEXT:    andq %r10, %r9
-; GFNI-NEXT:    leaq (%r9,%r13,4), %r9
-; GFNI-NEXT:    movq %r9, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %r9
-; GFNI-NEXT:    andq %r14, %r9
-; GFNI-NEXT:    leaq (%r9,%r13,2), %r9
-; GFNI-NEXT:    shrdq $48, %r9, %r12
+; GFNI-NEXT:    vmovq %r8, %xmm1
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %r8
 ; GFNI-NEXT:    bswapq %r8
-; GFNI-NEXT:    movq %r8, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %r8
-; GFNI-NEXT:    shlq $4, %r8
-; GFNI-NEXT:    orq %r13, %r8
-; GFNI-NEXT:    movq %r8, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %r8
-; GFNI-NEXT:    andq %r10, %r8
-; GFNI-NEXT:    leaq (%r8,%r13,4), %r8
-; GFNI-NEXT:    movq %r8, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %r8
-; GFNI-NEXT:    andq %r14, %r8
-; GFNI-NEXT:    leaq (%r8,%r13,2), %r8
-; GFNI-NEXT:    shrdq $48, %r8, %r9
+; GFNI-NEXT:    movq %r8, %r9
+; GFNI-NEXT:    shldq $16, %rdi, %r9
+; GFNI-NEXT:    shldq $16, %r10, %rdi
+; GFNI-NEXT:    vmovq %rcx, %xmm1
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %rcx
 ; GFNI-NEXT:    bswapq %rcx
-; GFNI-NEXT:    movq %rcx, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %rcx
-; GFNI-NEXT:    shlq $4, %rcx
-; GFNI-NEXT:    orq %r13, %rcx
-; GFNI-NEXT:    movq %rcx, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %rcx
-; GFNI-NEXT:    andq %r10, %rcx
-; GFNI-NEXT:    leaq (%rcx,%r13,4), %rcx
-; GFNI-NEXT:    movq %rcx, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %rcx
-; GFNI-NEXT:    andq %r14, %rcx
-; GFNI-NEXT:    leaq (%rcx,%r13,2), %rcx
 ; GFNI-NEXT:    shrdq $48, %rcx, %r8
+; GFNI-NEXT:    vmovq %rdx, %xmm1
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %rdx
 ; GFNI-NEXT:    bswapq %rdx
-; GFNI-NEXT:    movq %rdx, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %rdx
-; GFNI-NEXT:    shlq $4, %rdx
-; GFNI-NEXT:    orq %r13, %rdx
-; GFNI-NEXT:    movq %rdx, %r13
-; GFNI-NEXT:    andq %r10, %r13
-; GFNI-NEXT:    shrq $2, %rdx
-; GFNI-NEXT:    andq %r10, %rdx
-; GFNI-NEXT:    leaq (%rdx,%r13,4), %rdx
-; GFNI-NEXT:    movq %rdx, %r13
-; GFNI-NEXT:    andq %r14, %r13
-; GFNI-NEXT:    shrq %rdx
-; GFNI-NEXT:    andq %r14, %rdx
-; GFNI-NEXT:    leaq (%rdx,%r13,2), %rdx
 ; GFNI-NEXT:    shrdq $48, %rdx, %rcx
+; GFNI-NEXT:    vmovq %rsi, %xmm1
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %rsi
 ; GFNI-NEXT:    bswapq %rsi
-; GFNI-NEXT:    movq %rsi, %r13
-; GFNI-NEXT:    shrq $4, %r13
-; GFNI-NEXT:    andq %r11, %r13
-; GFNI-NEXT:    andq %r11, %rsi
-; GFNI-NEXT:    shlq $4, %rsi
-; GFNI-NEXT:    orq %r13, %rsi
-; GFNI-NEXT:    movq %rsi, %r11
-; GFNI-NEXT:    andq %r10, %r11
-; GFNI-NEXT:    shrq $2, %rsi
-; GFNI-NEXT:    andq %r10, %rsi
-; GFNI-NEXT:    leaq (%rsi,%r11,4), %rsi
-; GFNI-NEXT:    movq %rsi, %r10
-; GFNI-NEXT:    andq %r14, %r10
-; GFNI-NEXT:    shrq %rsi
-; GFNI-NEXT:    andq %r14, %rsi
-; GFNI-NEXT:    leaq (%rsi,%r10,2), %rsi
 ; GFNI-NEXT:    shrdq $48, %rsi, %rdx
+; GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %r11
+; GFNI-NEXT:    bswapq %r11
+; GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT:    vmovq %xmm1, %rbx
+; GFNI-NEXT:    bswapq %rbx
+; GFNI-NEXT:    shrdq $48, %rbx, %r11
+; GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm0
+; GFNI-NEXT:    vmovq %xmm0, %r14
+; GFNI-NEXT:    bswapq %r14
+; GFNI-NEXT:    shrdq $48, %r14, %rbx
+; GFNI-NEXT:    shrdq $48, %r10, %r14
 ; GFNI-NEXT:    shrq $48, %rsi
+; GFNI-NEXT:    movq %r14, 16(%rax)
+; GFNI-NEXT:    movq %rbx, 8(%rax)
+; GFNI-NEXT:    movq %r11, (%rax)
 ; GFNI-NEXT:    movq %rdx, 56(%rax)
 ; GFNI-NEXT:    movq %rcx, 48(%rax)
 ; GFNI-NEXT:    movq %r8, 40(%rax)
 ; GFNI-NEXT:    movq %r9, 32(%rax)
-; GFNI-NEXT:    movq %r12, 24(%rax)
-; GFNI-NEXT:    movq %r15, 16(%rax)
-; GFNI-NEXT:    movq %rbx, 8(%rax)
-; GFNI-NEXT:    movq %rdi, (%rax)
+; GFNI-NEXT:    movq %rdi, 24(%rax)
 ; GFNI-NEXT:    movw %si, 64(%rax)
 ; GFNI-NEXT:    popq %rbx
-; GFNI-NEXT:    popq %r12
-; GFNI-NEXT:    popq %r13
 ; GFNI-NEXT:    popq %r14
-; GFNI-NEXT:    popq %r15
 ; GFNI-NEXT:    retq
   %Z = call i528 @llvm.bitreverse.i528(i528 %A)
   ret i528 %Z
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index d3f357cd179525..6354477abccfce 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -274,27 +274,57 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX-LABEL: test_bitreverse_i32:
-; GFNIAVX:       # %bb.0:
-; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
-; GFNIAVX-NEXT:    bswapl %edi
-; GFNIAVX-NEXT:    movl %edi, %eax
-; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNIAVX-NEXT:    shll $4, %eax
-; GFNIAVX-NEXT:    shrl $4, %edi
-; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNIAVX-NEXT:    orl %eax, %edi
-; GFNIAVX-NEXT:    movl %edi, %eax
-; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX-NEXT:    shrl $2, %edi
-; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
-; GFNIAVX-NEXT:    movl %eax, %ecx
-; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX-NEXT:    shrl %eax
-; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
-; GFNIAVX-NEXT:    retq
+; GFNIAVX1-LABEL: test_bitreverse_i32:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX1-NEXT:    bswapl %edi
+; GFNIAVX1-NEXT:    movl %edi, %eax
+; GFNIAVX1-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX1-NEXT:    shll $4, %eax
+; GFNIAVX1-NEXT:    shrl $4, %edi
+; GFNIAVX1-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; GFNIAVX1-NEXT:    orl %eax, %edi
+; GFNIAVX1-NEXT:    movl %edi, %eax
+; GFNIAVX1-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX1-NEXT:    shrl $2, %edi
+; GFNIAVX1-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; GFNIAVX1-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX1-NEXT:    movl %eax, %ecx
+; GFNIAVX1-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX1-NEXT:    shrl %eax
+; GFNIAVX1-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; GFNIAVX1-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX2-NEXT:    bswapl %edi
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX2-NEXT:    shll $4, %eax
+; GFNIAVX2-NEXT:    shrl $4, %edi
+; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; GFNIAVX2-NEXT:    orl %eax, %edi
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX2-NEXT:    shrl $2, %edi
+; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX2-NEXT:    movl %eax, %ecx
+; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX2-NEXT:    shrl %eax
+; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i32:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vmovd %edi, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vmovd %xmm0, %eax
+; GFNIAVX512-NEXT:    bswapl %eax
+; GFNIAVX512-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %b
 }
@@ -379,29 +409,61 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX-LABEL: test_bitreverse_i64:
-; GFNIAVX:       # %bb.0:
-; GFNIAVX-NEXT:    bswapq %rdi
-; GFNIAVX-NEXT:    movq %rdi, %rax
-; GFNIAVX-NEXT:    shrq $4, %rax
-; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX-NEXT:    andq %rcx, %rax
-; GFNIAVX-NEXT:    andq %rcx, %rdi
-; GFNIAVX-NEXT:    shlq $4, %rdi
-; GFNIAVX-NEXT:    orq %rax, %rdi
-; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNIAVX-NEXT:    movq %rdi, %rcx
-; GFNIAVX-NEXT:    andq %rax, %rcx
-; GFNIAVX-NEXT:    shrq $2, %rdi
-; GFNIAVX-NEXT:    andq %rax, %rdi
-; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
-; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; GFNIAVX-NEXT:    movq %rax, %rdx
-; GFNIAVX-NEXT:    andq %rcx, %rdx
-; GFNIAVX-NEXT:    shrq %rax
-; GFNIAVX-NEXT:    andq %rcx, %rax
-; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
-; GFNIAVX-NEXT:    retq
+; GFNIAVX1-LABEL: test_bitreverse_i64:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    bswapq %rdi
+; GFNIAVX1-NEXT:    movq %rdi, %rax
+; GFNIAVX1-NEXT:    shrq $4, %rax
+; GFNIAVX1-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX1-NEXT:    andq %rcx, %rax
+; GFNIAVX1-NEXT:    andq %rcx, %rdi
+; GFNIAVX1-NEXT:    shlq $4, %rdi
+; GFNIAVX1-NEXT:    orq %rax, %rdi
+; GFNIAVX1-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX1-NEXT:    movq %rdi, %rcx
+; GFNIAVX1-NEXT:    andq %rax, %rcx
+; GFNIAVX1-NEXT:    shrq $2, %rdi
+; GFNIAVX1-NEXT:    andq %rax, %rdi
+; GFNIAVX1-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX1-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX1-NEXT:    movq %rax, %rdx
+; GFNIAVX1-NEXT:    andq %rcx, %rdx
+; GFNIAVX1-NEXT:    shrq %rax
+; GFNIAVX1-NEXT:    andq %rcx, %rax
+; GFNIAVX1-NEXT:    leaq (%rax,%rdx,2), %rax
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i64:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    bswapq %rdi
+; GFNIAVX2-NEXT:    movq %rdi, %rax
+; GFNIAVX2-NEXT:    shrq $4, %rax
+; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX2-NEXT:    andq %rcx, %rax
+; GFNIAVX2-NEXT:    andq %rcx, %rdi
+; GFNIAVX2-NEXT:    shlq $4, %rdi
+; GFNIAVX2-NEXT:    orq %rax, %rdi
+; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT:    movq %rdi, %rcx
+; GFNIAVX2-NEXT:    andq %rax, %rcx
+; GFNIAVX2-NEXT:    shrq $2, %rdi
+; GFNIAVX2-NEXT:    andq %rax, %rdi
+; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT:    movq %rax, %rdx
+; GFNIAVX2-NEXT:    andq %rcx, %rdx
+; GFNIAVX2-NEXT:    shrq %rax
+; GFNIAVX2-NEXT:    andq %rcx, %rax
+; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i64:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vmovq %rdi, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vmovq %xmm0, %rax
+; GFNIAVX512-NEXT:    bswapq %rax
+; GFNIAVX512-NEXT:    retq
   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %b
 }

>From c04f702fa398935cd73a1679381aeafe3a137e0f Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Thu, 15 Feb 2024 00:51:22 +0530
Subject: [PATCH 2/6] fix clang formatting.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35370762fe701c..6dbee6757a4ff9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31320,7 +31320,7 @@ static auto createBSWAPShuffleMask(EVT VT) {
       ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
 
   return ShuffleMask;
-} 
+}
 
 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {

>From bcde4ac284c37d6882445e8adf2c58377196d63b Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Tue, 27 Feb 2024 22:11:18 +0530
Subject: [PATCH 3/6] Address review comments.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6dbee6757a4ff9..01f1ded63b6bc2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31312,14 +31312,11 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
   return DAG.getBitcast(VT, Res);
 }
 
-static auto createBSWAPShuffleMask(EVT VT) {
-  SmallVector<int, 16> ShuffleMask;
+static void createBSWAPShuffleMask(EVT VT, SmallVector<int, 16>& ShuffleMask) {
   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
   for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
     for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
       ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
-
-  return ShuffleMask;
 }
 
 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
@@ -31334,7 +31331,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   if (HasGFNI && ((ScalarType == MVT::i32) || (ScalarType == MVT::i64))) {
     if (VT.isVector()) {
       SmallVector<int, 16> BSWAPMask = createBSWAPShuffleMask(VT);
-      EVT ByteVT =
+      MVT ByteVT =
           EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
       SDValue VecShuffle = DAG.getVectorShuffle(
           ByteVT, DL, DAG.getNode(ISD::BITCAST, DL, ByteVT, In),
@@ -31355,14 +31352,6 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  if (Subtarget.hasXOP() && !VT.is512BitVector())
-    return LowerBITREVERSE_XOP(Op, DAG);
-
-  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
-
-  assert(VT.getScalarType() == MVT::i8 &&
-         "Only byte vector BITREVERSE supported");
-
   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
 
   if (Subtarget.hasXOP() && !VT.is512BitVector())

>From f5dbf6615a2793fa9fcf7320ee99325298ed465b Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Wed, 10 Apr 2024 19:36:25 +0530
Subject: [PATCH 4/6] i32/i64 support after code restructure for vectors.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |  63 +++++------
 llvm/test/CodeGen/X86/vector-bitreverse.ll | 120 +++------------------
 2 files changed, 38 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 01f1ded63b6bc2..d5a29321d0959b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1496,6 +1496,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE,          MVT::v32i32, Custom);
     setOperationAction(ISD::TRUNCATE,          MVT::v32i64, Custom);
 
+    if (Subtarget.hasGFNI()) {
+      setOperationAction(ISD::BITREVERSE,      MVT::i32, Custom);
+      setOperationAction(ISD::BITREVERSE,      MVT::i64, Custom);
+    }
+
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::SETCC,           VT, Custom);
       setOperationAction(ISD::CTPOP,           VT, Custom);
@@ -31312,53 +31317,18 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
   return DAG.getBitcast(VT, Res);
 }
 
-static void createBSWAPShuffleMask(EVT VT, SmallVector<int, 16>& ShuffleMask) {
-  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
-  for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
-    for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
-      ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
-}
-
 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  SDValue In = Op.getOperand(0);
-  SDLoc DL(Op);
-
-  auto HasGFNI = Subtarget.hasGFNI();
-  auto ScalarType = VT.getScalarType();
-
-  if (HasGFNI && ((ScalarType == MVT::i32) || (ScalarType == MVT::i64))) {
-    if (VT.isVector()) {
-      SmallVector<int, 16> BSWAPMask = createBSWAPShuffleMask(VT);
-      MVT ByteVT =
-          EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
-      SDValue VecShuffle = DAG.getVectorShuffle(
-          ByteVT, DL, DAG.getNode(ISD::BITCAST, DL, ByteVT, In),
-          DAG.getUNDEF(ByteVT), BSWAPMask);
-      SDValue BitReverse = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, VecShuffle);
-      return DAG.getBitcast(VT, BitReverse);
-    } else {
-      auto CastTo = ScalarType == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
-      SDValue ScalarToVector =
-          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CastTo, In);
-      SDValue BitReverse =
-          DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
-                      DAG.getBitcast(MVT::v16i8, ScalarToVector));
-      SDValue ExtractElementZero = DAG.getNode(
-          ISD::EXTRACT_VECTOR_ELT, DL, ScalarType,
-          DAG.getBitcast(CastTo, BitReverse), DAG.getIntPtrConstant(0, DL));
-      return DAG.getNode(ISD::BSWAP, DL, ScalarType, ExtractElementZero);
-    }
-  }
-
-  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
 
   if (Subtarget.hasXOP() && !VT.is512BitVector())
     return LowerBITREVERSE_XOP(Op, DAG);
 
   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
 
+  SDValue In = Op.getOperand(0);
+  SDLoc DL(Op);
+
   // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
   if (VT.is512BitVector() && !Subtarget.hasBWI())
     return splitVectorIntUnary(Op, DAG, DL);
@@ -31367,6 +31337,21 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return splitVectorIntUnary(Op, DAG, DL);
 
+  // Lower i32/i64 to GFNI as i32/i64 -> Convert to vector (V = v4i32/v2i64) -> v16i8 BITREVERSE -> V[0] -> BSWAP
+  if (Subtarget.hasGFNI() && !VT.isVector()) {
+
+    assert ((VT.getScalarType() == MVT::i32) || (VT.getScalarType() == MVT::i64));
+
+    auto ScalarType = VT.getScalarType();
+    auto CastTo = ScalarType == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CastTo, In);
+    Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, Res));
+    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarType, DAG.getBitcast(CastTo, Res), DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::BSWAP, DL, ScalarType, Res);
+  }
+
+  assert (VT.isVector() && VT.getSizeInBits() >= 128);
+
   // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
   if (VT.getScalarType() != MVT::i8) {
     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
@@ -31381,7 +31366,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   unsigned NumElts = VT.getVectorNumElements();
 
   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
-  if (HasGFNI) {
+  if (Subtarget.hasGFNI()) {
     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
     Matrix = DAG.getBitcast(VT, Matrix);
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 6354477abccfce..1c5326d35bb00a 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -274,57 +274,13 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: test_bitreverse_i32:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; GFNIAVX1-NEXT:    bswapl %edi
-; GFNIAVX1-NEXT:    movl %edi, %eax
-; GFNIAVX1-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNIAVX1-NEXT:    shll $4, %eax
-; GFNIAVX1-NEXT:    shrl $4, %edi
-; GFNIAVX1-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNIAVX1-NEXT:    orl %eax, %edi
-; GFNIAVX1-NEXT:    movl %edi, %eax
-; GFNIAVX1-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX1-NEXT:    shrl $2, %edi
-; GFNIAVX1-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; GFNIAVX1-NEXT:    leal (%rdi,%rax,4), %eax
-; GFNIAVX1-NEXT:    movl %eax, %ecx
-; GFNIAVX1-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX1-NEXT:    shrl %eax
-; GFNIAVX1-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; GFNIAVX1-NEXT:    leal (%rax,%rcx,2), %eax
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: test_bitreverse_i32:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; GFNIAVX2-NEXT:    bswapl %edi
-; GFNIAVX2-NEXT:    movl %edi, %eax
-; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; GFNIAVX2-NEXT:    shll $4, %eax
-; GFNIAVX2-NEXT:    shrl $4, %edi
-; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; GFNIAVX2-NEXT:    orl %eax, %edi
-; GFNIAVX2-NEXT:    movl %edi, %eax
-; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX2-NEXT:    shrl $2, %edi
-; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
-; GFNIAVX2-NEXT:    movl %eax, %ecx
-; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX2-NEXT:    shrl %eax
-; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
-; GFNIAVX2-NEXT:    retq
-;
-; GFNIAVX512-LABEL: test_bitreverse_i32:
-; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vmovd %edi, %xmm0
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vmovd %xmm0, %eax
-; GFNIAVX512-NEXT:    bswapl %eax
-; GFNIAVX512-NEXT:    retq
+; GFNIAVX-LABEL: test_bitreverse_i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vmovd %edi, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    vmovd %xmm0, %eax
+; GFNIAVX-NEXT:    bswapl %eax
+; GFNIAVX-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %b
 }
@@ -409,61 +365,13 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: test_bitreverse_i64:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    bswapq %rdi
-; GFNIAVX1-NEXT:    movq %rdi, %rax
-; GFNIAVX1-NEXT:    shrq $4, %rax
-; GFNIAVX1-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX1-NEXT:    andq %rcx, %rax
-; GFNIAVX1-NEXT:    andq %rcx, %rdi
-; GFNIAVX1-NEXT:    shlq $4, %rdi
-; GFNIAVX1-NEXT:    orq %rax, %rdi
-; GFNIAVX1-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNIAVX1-NEXT:    movq %rdi, %rcx
-; GFNIAVX1-NEXT:    andq %rax, %rcx
-; GFNIAVX1-NEXT:    shrq $2, %rdi
-; GFNIAVX1-NEXT:    andq %rax, %rdi
-; GFNIAVX1-NEXT:    leaq (%rdi,%rcx,4), %rax
-; GFNIAVX1-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; GFNIAVX1-NEXT:    movq %rax, %rdx
-; GFNIAVX1-NEXT:    andq %rcx, %rdx
-; GFNIAVX1-NEXT:    shrq %rax
-; GFNIAVX1-NEXT:    andq %rcx, %rax
-; GFNIAVX1-NEXT:    leaq (%rax,%rdx,2), %rax
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: test_bitreverse_i64:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    bswapq %rdi
-; GFNIAVX2-NEXT:    movq %rdi, %rax
-; GFNIAVX2-NEXT:    shrq $4, %rax
-; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX2-NEXT:    andq %rcx, %rax
-; GFNIAVX2-NEXT:    andq %rcx, %rdi
-; GFNIAVX2-NEXT:    shlq $4, %rdi
-; GFNIAVX2-NEXT:    orq %rax, %rdi
-; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; GFNIAVX2-NEXT:    movq %rdi, %rcx
-; GFNIAVX2-NEXT:    andq %rax, %rcx
-; GFNIAVX2-NEXT:    shrq $2, %rdi
-; GFNIAVX2-NEXT:    andq %rax, %rdi
-; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
-; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; GFNIAVX2-NEXT:    movq %rax, %rdx
-; GFNIAVX2-NEXT:    andq %rcx, %rdx
-; GFNIAVX2-NEXT:    shrq %rax
-; GFNIAVX2-NEXT:    andq %rcx, %rax
-; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
-; GFNIAVX2-NEXT:    retq
-;
-; GFNIAVX512-LABEL: test_bitreverse_i64:
-; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vmovq %rdi, %xmm0
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vmovq %xmm0, %rax
-; GFNIAVX512-NEXT:    bswapq %rax
-; GFNIAVX512-NEXT:    retq
+; GFNIAVX-LABEL: test_bitreverse_i64:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vmovq %rdi, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    vmovq %xmm0, %rax
+; GFNIAVX-NEXT:    bswapq %rax
+; GFNIAVX-NEXT:    retq
   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %b
 }

>From 163e6a60afe6d617a43b8e06a49835f0ee63db86 Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Wed, 10 Apr 2024 21:13:34 +0530
Subject: [PATCH 5/6] Address review comments (2).

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d5a29321d0959b..60f2d8768041b9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31337,17 +31337,17 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return splitVectorIntUnary(Op, DAG, DL);
 
-  // Lower i32/i64 to GFNI as i32/i64 -> Convert to vector (V = v16i32/v8i64) -> vXi8 BITREVERSE -> V[0] -> BSWAP
-  if (Subtarget.hasGFNI() && !VT.isVector()) {
+  // Lower i32/i64 to GFNI as vXi8 BITREVERSE + BSWAP
+  if (!VT.isVector()) {
 
     assert ((VT.getScalarType() == MVT::i32) || (VT.getScalarType() == MVT::i64));
 
-    auto ScalarType = VT.getScalarType();
-    auto CastTo = ScalarType == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
-    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CastTo, In);
+    MVT SVT = VT.getScalarType();
+    MVT VecVT = MVT::getVectorVT(SVT, 128 / SVT.getSizeInBits());
+    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
     Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, Res));
-    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarType, DAG.getBitcast(CastTo, Res), DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::BSWAP, DL, ScalarType, Res);
+    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::BSWAP, DL, SVT, Res);
   }
 
   assert (VT.isVector() && VT.getSizeInBits() >= 128);

>From e8f2236bd273f8f17206d919bedd6f41a906a194 Mon Sep 17 00:00:00 2001
From: shami <shami_thoke at yahoo.com>
Date: Wed, 10 Apr 2024 23:11:28 +0530
Subject: [PATCH 6/6] Formatting changes.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 60f2d8768041b9..01e336c6789f5d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1497,8 +1497,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE,          MVT::v32i64, Custom);
 
     if (Subtarget.hasGFNI()) {
-      setOperationAction(ISD::BITREVERSE,      MVT::i32, Custom);
-      setOperationAction(ISD::BITREVERSE,      MVT::i64, Custom);
+      setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
+      setOperationAction(ISD::BITREVERSE, MVT::i64, Custom);
     }
 
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -31340,17 +31340,20 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   // Lower i32/i64 to GFNI as vXi8 BITREVERSE + BSWAP
   if (!VT.isVector()) {
 
-    assert ((VT.getScalarType() == MVT::i32) || (VT.getScalarType() == MVT::i64));
+    assert((VT.getScalarType() == MVT::i32) ||
+           (VT.getScalarType() == MVT::i64));
 
     MVT SVT = VT.getScalarType();
     MVT VecVT = MVT::getVectorVT(SVT, 128 / SVT.getSizeInBits());
     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
-    Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, Res));
-    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
+    Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
+                      DAG.getBitcast(MVT::v16i8, Res));
+    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT,
+                      DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
     return DAG.getNode(ISD::BSWAP, DL, SVT, Res);
   }
 
-  assert (VT.isVector() && VT.getSizeInBits() >= 128);
+  assert(VT.isVector() && VT.getSizeInBits() >= 128);
 
   // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
   if (VT.getScalarType() != MVT::i8) {