[llvm] f0b0bab - [X86] Use GF2P8AFFINEQB to implement vector bitreverse.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 17 23:50:05 PST 2020


Author: Craig Topper
Date: 2020-11-17T23:49:06-08:00
New Revision: f0b0bab34dd82c07d153d31363d89ebd04cc733d

URL: https://github.com/llvm/llvm-project/commit/f0b0bab34dd82c07d153d31363d89ebd04cc733d
DIFF: https://github.com/llvm/llvm-project/commit/f0b0bab34dd82c07d153d31363d89ebd04cc733d.diff

LOG: [X86] Use GF2P8AFFINEQB to implement vector bitreverse.

We can use GF2P8AFFINEQB to reverse bits in a byte. Shuffles are needed to reverse the bytes in elements larger than i8. LegalizeVectorOps takes care of inserting the shuffle for the larger element size.

We already have Custom lowering for v16i8 with SSSE3, v32i8 with AVX, and v64i8 with AVX512BW.

I think we might be able to use this for scalars too by moving into a vector and back. But I'll save that for a follow-up as it's a little more involved.

Reviewed By: RKSimon, pengfei

Differential Revision: https://reviews.llvm.org/D91515

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/bitreverse.ll
    llvm/test/CodeGen/X86/vector-bitreverse.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index be77051cf82e..41b58af023b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28977,18 +28977,28 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   SDValue In = Op.getOperand(0);
   SDLoc DL(Op);
 
+  assert(VT.getScalarType() == MVT::i8 &&
+         "Only byte vector BITREVERSE supported");
+
   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
     return splitVectorIntUnary(Op, DAG);
 
-  unsigned NumElts = VT.getVectorNumElements();
-  assert(VT.getScalarType() == MVT::i8 &&
-         "Only byte vector BITREVERSE supported");
-
   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
-  if (VT.is256BitVector() && !Subtarget.hasInt256())
+  if (VT == MVT::v32i8 && !Subtarget.hasInt256())
     return splitVectorIntUnary(Op, DAG);
 
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
+  if (Subtarget.hasGFNI()) {
+    MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+    Matrix = DAG.getBitcast(VT, Matrix);
+    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
+                       DAG.getTargetConstant(0, DL, MVT::i8));
+  }
+
   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
   // two nibbles and a PSHUFB lookup to find the bitreverse of each
   // 0-15 value (moved to the other nibble).

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 978936996132..2c35f74ab95e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2783,6 +2783,22 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
     MVT MTy = LT.second;
 
     // Attempt to lookup cost.
+    if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
+        MTy.isVector()) {
+      // With PSHUFB the code is very similar for all types. If we have integer
+      // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
+      // we also need a PSHUFB.
+      unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
+
+      // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
+      // instructions. We also need an extract and an insert.
+      if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
+            (ST->hasBWI() && MTy.is512BitVector())))
+        Cost = Cost * 2 + 2;
+
+      return LT.first * Cost;
+    }
+
     if (ST->useGLMDivSqrtCosts())
       if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
         return LT.first * Entry->Cost;

diff  --git a/llvm/test/Analysis/CostModel/X86/bitreverse.ll b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
index 2497c4bc4fa1..745da07da101 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
@@ -13,6 +13,11 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=X64,AVX512,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+xop | FileCheck %s -check-prefixes=XOP
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=XOP
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+gfni,+ssse3 | FileCheck %s -check-prefixes=GFNISSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+gfni,+avx | FileCheck %s -check-prefixes=GFNIAVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+gfni,+avx2 | FileCheck %s -check-prefixes=GFNIAVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+gfni,+avx512f | FileCheck %s -check-prefixes=GFNIAVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+gfni,+avx512bw | FileCheck %s -check-prefixes=GFNIAVX512BW
 
 ; Verify the cost of scalar bitreverse instructions.
 
@@ -33,6 +38,26 @@ define i64 @var_bitreverse_i64(i64 %a) {
 ; XOP-LABEL: 'var_bitreverse_i64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_i64'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_i64'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_i64'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_i64'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_i64'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse
 ;
   %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %bitreverse
@@ -50,6 +75,26 @@ define i32 @var_bitreverse_i32(i32 %a) {
 ; XOP-LABEL: 'var_bitreverse_i32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_i32'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_i32'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_i32'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_i32'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_i32'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse
 ;
   %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %bitreverse
@@ -67,6 +112,26 @@ define i16 @var_bitreverse_i16(i16 %a) {
 ; XOP-LABEL: 'var_bitreverse_i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_i16'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_i16'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_i16'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_i16'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_i16'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
 ;
   %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
   ret i16 %bitreverse
@@ -84,6 +149,26 @@ define i8 @var_bitreverse_i8(i8 %a) {
 ; XOP-LABEL: 'var_bitreverse_i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_i8'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_i8'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_i8'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_i8'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_i8'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
 ;
   %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %bitreverse
@@ -126,6 +211,26 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 ; XOP-LABEL: 'var_bitreverse_v2i64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v2i64'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v2i64'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v2i64'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
 ;
   %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %bitreverse
@@ -155,6 +260,26 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; XOP-LABEL: 'var_bitreverse_v4i64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v4i64'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v4i64'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v4i64'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
 ;
   %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   ret <4 x i64> %bitreverse
@@ -188,6 +313,26 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; XOP-LABEL: 'var_bitreverse_v8i64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v8i64'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v8i64'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v8i64'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
 ;
   %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
   ret <8 x i64> %bitreverse
@@ -213,6 +358,26 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 ; XOP-LABEL: 'var_bitreverse_v4i32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v4i32'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v4i32'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v4i32'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
 ;
   %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %bitreverse
@@ -242,6 +407,26 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; XOP-LABEL: 'var_bitreverse_v8i32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v8i32'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v8i32'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v8i32'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
 ;
   %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %bitreverse
@@ -275,6 +460,26 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; XOP-LABEL: 'var_bitreverse_v16i32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v16i32'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v16i32'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v16i32'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
 ;
   %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
   ret <16 x i32> %bitreverse
@@ -300,6 +505,26 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 ; XOP-LABEL: 'var_bitreverse_v8i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v8i16'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v8i16'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v8i16'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
 ;
   %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %bitreverse
@@ -329,6 +554,26 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; XOP-LABEL: 'var_bitreverse_v16i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v16i16'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v16i16'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v16i16'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
 ;
   %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %bitreverse
@@ -362,6 +607,26 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; XOP-LABEL: 'var_bitreverse_v32i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v32i16'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v32i16'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v32i16'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
 ;
   %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
   ret <32 x i16> %bitreverse
@@ -387,6 +652,26 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 ; XOP-LABEL: 'var_bitreverse_v16i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v16i8'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v16i8'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v16i8'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v16i8'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i8'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
 ;
   %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %bitreverse
@@ -416,6 +701,26 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; XOP-LABEL: 'var_bitreverse_v32i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v32i8'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v32i8'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v32i8'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v32i8'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i8'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
 ;
   %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %bitreverse
@@ -449,6 +754,26 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; XOP-LABEL: 'var_bitreverse_v64i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
+;
+; GFNISSE-LABEL: 'var_bitreverse_v64i8'
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNISSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
+;
+; GFNIAVX-LABEL: 'var_bitreverse_v64i8'
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
+;
+; GFNIAVX2-LABEL: 'var_bitreverse_v64i8'
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
+;
+; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8'
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
+;
+; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8'
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
 ;
   %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
   ret <64 x i8> %bitreverse

diff  --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index f99fc38c6625..a5af0fa53bca 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -7,6 +7,11 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW
 
 ; Make sure we don't crash with avx512bw and xop
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
@@ -57,6 +62,101 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    # kill: def $al killed $al killed $eax
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNISSE-NEXT:    rolb $4, %dil
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andb $51, %al
+; GFNISSE-NEXT:    shlb $2, %al
+; GFNISSE-NEXT:    andb $-52, %dil
+; GFNISSE-NEXT:    shrb $2, %dil
+; GFNISSE-NEXT:    orb %al, %dil
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andb $85, %al
+; GFNISSE-NEXT:    addb %al, %al
+; GFNISSE-NEXT:    andb $-86, %dil
+; GFNISSE-NEXT:    shrb %dil
+; GFNISSE-NEXT:    addl %edi, %eax
+; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX-NEXT:    rolb $4, %dil
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andb $51, %al
+; GFNIAVX-NEXT:    shlb $2, %al
+; GFNIAVX-NEXT:    andb $-52, %dil
+; GFNIAVX-NEXT:    shrb $2, %dil
+; GFNIAVX-NEXT:    orb %al, %dil
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andb $85, %al
+; GFNIAVX-NEXT:    addb %al, %al
+; GFNIAVX-NEXT:    andb $-86, %dil
+; GFNIAVX-NEXT:    shrb %dil
+; GFNIAVX-NEXT:    addl %edi, %eax
+; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX2-NEXT:    rolb $4, %dil
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andb $51, %al
+; GFNIAVX2-NEXT:    shlb $2, %al
+; GFNIAVX2-NEXT:    andb $-52, %dil
+; GFNIAVX2-NEXT:    shrb $2, %dil
+; GFNIAVX2-NEXT:    orb %al, %dil
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andb $85, %al
+; GFNIAVX2-NEXT:    addb %al, %al
+; GFNIAVX2-NEXT:    andb $-86, %dil
+; GFNIAVX2-NEXT:    shrb %dil
+; GFNIAVX2-NEXT:    addl %edi, %eax
+; GFNIAVX2-NEXT:    # kill: def $al killed $al killed $eax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_i8:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512F-NEXT:    rolb $4, %dil
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andb $51, %al
+; GFNIAVX512F-NEXT:    shlb $2, %al
+; GFNIAVX512F-NEXT:    andb $-52, %dil
+; GFNIAVX512F-NEXT:    shrb $2, %dil
+; GFNIAVX512F-NEXT:    orb %al, %dil
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andb $85, %al
+; GFNIAVX512F-NEXT:    addb %al, %al
+; GFNIAVX512F-NEXT:    andb $-86, %dil
+; GFNIAVX512F-NEXT:    shrb %dil
+; GFNIAVX512F-NEXT:    addl %edi, %eax
+; GFNIAVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512BW-NEXT:    rolb $4, %dil
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andb $51, %al
+; GFNIAVX512BW-NEXT:    shlb $2, %al
+; GFNIAVX512BW-NEXT:    andb $-52, %dil
+; GFNIAVX512BW-NEXT:    shrb $2, %dil
+; GFNIAVX512BW-NEXT:    orb %al, %dil
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andb $85, %al
+; GFNIAVX512BW-NEXT:    addb %al, %al
+; GFNIAVX512BW-NEXT:    andb $-86, %dil
+; GFNIAVX512BW-NEXT:    shrb %dil
+; GFNIAVX512BW-NEXT:    addl %edi, %eax
+; GFNIAVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; GFNIAVX512BW-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
 }
@@ -115,6 +215,121 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_i16:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNISSE-NEXT:    rolw $8, %di
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
+; GFNISSE-NEXT:    shll $4, %eax
+; GFNISSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; GFNISSE-NEXT:    shrl $4, %edi
+; GFNISSE-NEXT:    orl %eax, %edi
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
+; GFNISSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; GFNISSE-NEXT:    shrl $2, %edi
+; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNISSE-NEXT:    movl %eax, %ecx
+; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
+; GFNISSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; GFNISSE-NEXT:    shrl %eax
+; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_i16:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX-NEXT:    rolw $8, %di
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
+; GFNIAVX-NEXT:    shll $4, %eax
+; GFNIAVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; GFNIAVX-NEXT:    shrl $4, %edi
+; GFNIAVX-NEXT:    orl %eax, %edi
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
+; GFNIAVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; GFNIAVX-NEXT:    shrl $2, %edi
+; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX-NEXT:    movl %eax, %ecx
+; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
+; GFNIAVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; GFNIAVX-NEXT:    shrl %eax
+; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i16:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX2-NEXT:    rolw $8, %di
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
+; GFNIAVX2-NEXT:    shll $4, %eax
+; GFNIAVX2-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; GFNIAVX2-NEXT:    shrl $4, %edi
+; GFNIAVX2-NEXT:    orl %eax, %edi
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
+; GFNIAVX2-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; GFNIAVX2-NEXT:    shrl $2, %edi
+; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX2-NEXT:    movl %eax, %ecx
+; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
+; GFNIAVX2-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; GFNIAVX2-NEXT:    shrl %eax
+; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_i16:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512F-NEXT:    rolw $8, %di
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
+; GFNIAVX512F-NEXT:    shll $4, %eax
+; GFNIAVX512F-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; GFNIAVX512F-NEXT:    shrl $4, %edi
+; GFNIAVX512F-NEXT:    orl %eax, %edi
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
+; GFNIAVX512F-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; GFNIAVX512F-NEXT:    shrl $2, %edi
+; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX512F-NEXT:    movl %eax, %ecx
+; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
+; GFNIAVX512F-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; GFNIAVX512F-NEXT:    shrl %eax
+; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_i16:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512BW-NEXT:    rolw $8, %di
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
+; GFNIAVX512BW-NEXT:    shll $4, %eax
+; GFNIAVX512BW-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; GFNIAVX512BW-NEXT:    shrl $4, %edi
+; GFNIAVX512BW-NEXT:    orl %eax, %edi
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
+; GFNIAVX512BW-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; GFNIAVX512BW-NEXT:    shrl $2, %edi
+; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX512BW-NEXT:    movl %eax, %ecx
+; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
+; GFNIAVX512BW-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; GFNIAVX512BW-NEXT:    shrl %eax
+; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; GFNIAVX512BW-NEXT:    retq
   %b = call i16 @llvm.bitreverse.i16(i16 %a)
   ret i16 %b
 }
@@ -170,6 +385,116 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_i32:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNISSE-NEXT:    bswapl %edi
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNISSE-NEXT:    shll $4, %eax
+; GFNISSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; GFNISSE-NEXT:    shrl $4, %edi
+; GFNISSE-NEXT:    orl %eax, %edi
+; GFNISSE-NEXT:    movl %edi, %eax
+; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNISSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; GFNISSE-NEXT:    shrl $2, %edi
+; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNISSE-NEXT:    movl %eax, %ecx
+; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNISSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; GFNISSE-NEXT:    shrl %eax
+; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX-NEXT:    bswapl %edi
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX-NEXT:    shll $4, %eax
+; GFNIAVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; GFNIAVX-NEXT:    shrl $4, %edi
+; GFNIAVX-NEXT:    orl %eax, %edi
+; GFNIAVX-NEXT:    movl %edi, %eax
+; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; GFNIAVX-NEXT:    shrl $2, %edi
+; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX-NEXT:    movl %eax, %ecx
+; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; GFNIAVX-NEXT:    shrl %eax
+; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX2-NEXT:    bswapl %edi
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX2-NEXT:    shll $4, %eax
+; GFNIAVX2-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; GFNIAVX2-NEXT:    shrl $4, %edi
+; GFNIAVX2-NEXT:    orl %eax, %edi
+; GFNIAVX2-NEXT:    movl %edi, %eax
+; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX2-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; GFNIAVX2-NEXT:    shrl $2, %edi
+; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX2-NEXT:    movl %eax, %ecx
+; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX2-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; GFNIAVX2-NEXT:    shrl %eax
+; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_i32:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512F-NEXT:    bswapl %edi
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX512F-NEXT:    shll $4, %eax
+; GFNIAVX512F-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; GFNIAVX512F-NEXT:    shrl $4, %edi
+; GFNIAVX512F-NEXT:    orl %eax, %edi
+; GFNIAVX512F-NEXT:    movl %edi, %eax
+; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX512F-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; GFNIAVX512F-NEXT:    shrl $2, %edi
+; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX512F-NEXT:    movl %eax, %ecx
+; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX512F-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; GFNIAVX512F-NEXT:    shrl %eax
+; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_i32:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
+; GFNIAVX512BW-NEXT:    bswapl %edi
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; GFNIAVX512BW-NEXT:    shll $4, %eax
+; GFNIAVX512BW-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; GFNIAVX512BW-NEXT:    shrl $4, %edi
+; GFNIAVX512BW-NEXT:    orl %eax, %edi
+; GFNIAVX512BW-NEXT:    movl %edi, %eax
+; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; GFNIAVX512BW-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; GFNIAVX512BW-NEXT:    shrl $2, %edi
+; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
+; GFNIAVX512BW-NEXT:    movl %eax, %ecx
+; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; GFNIAVX512BW-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; GFNIAVX512BW-NEXT:    shrl %eax
+; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
+; GFNIAVX512BW-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %b
 }
@@ -229,6 +554,126 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    vmovq %xmm0, %rax
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_i64:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    bswapq %rdi
+; GFNISSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; GFNISSE-NEXT:    andq %rdi, %rax
+; GFNISSE-NEXT:    shlq $4, %rax
+; GFNISSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; GFNISSE-NEXT:    andq %rdi, %rcx
+; GFNISSE-NEXT:    shrq $4, %rcx
+; GFNISSE-NEXT:    orq %rax, %rcx
+; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNISSE-NEXT:    andq %rcx, %rax
+; GFNISSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; GFNISSE-NEXT:    andq %rcx, %rdx
+; GFNISSE-NEXT:    shrq $2, %rdx
+; GFNISSE-NEXT:    leaq (%rdx,%rax,4), %rax
+; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNISSE-NEXT:    andq %rax, %rcx
+; GFNISSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; GFNISSE-NEXT:    andq %rax, %rdx
+; GFNISSE-NEXT:    shrq %rdx
+; GFNISSE-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_i64:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    bswapq %rdi
+; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX-NEXT:    andq %rdi, %rax
+; GFNIAVX-NEXT:    shlq $4, %rax
+; GFNIAVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; GFNIAVX-NEXT:    andq %rdi, %rcx
+; GFNIAVX-NEXT:    shrq $4, %rcx
+; GFNIAVX-NEXT:    orq %rax, %rcx
+; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX-NEXT:    andq %rcx, %rax
+; GFNIAVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; GFNIAVX-NEXT:    andq %rcx, %rdx
+; GFNIAVX-NEXT:    shrq $2, %rdx
+; GFNIAVX-NEXT:    leaq (%rdx,%rax,4), %rax
+; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX-NEXT:    andq %rax, %rcx
+; GFNIAVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX-NEXT:    andq %rax, %rdx
+; GFNIAVX-NEXT:    shrq %rdx
+; GFNIAVX-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i64:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    bswapq %rdi
+; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX2-NEXT:    andq %rdi, %rax
+; GFNIAVX2-NEXT:    shlq $4, %rax
+; GFNIAVX2-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; GFNIAVX2-NEXT:    andq %rdi, %rcx
+; GFNIAVX2-NEXT:    shrq $4, %rcx
+; GFNIAVX2-NEXT:    orq %rax, %rcx
+; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT:    andq %rcx, %rax
+; GFNIAVX2-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; GFNIAVX2-NEXT:    andq %rcx, %rdx
+; GFNIAVX2-NEXT:    shrq $2, %rdx
+; GFNIAVX2-NEXT:    leaq (%rdx,%rax,4), %rax
+; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT:    andq %rax, %rcx
+; GFNIAVX2-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX2-NEXT:    andq %rax, %rdx
+; GFNIAVX2-NEXT:    shrq %rdx
+; GFNIAVX2-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_i64:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    bswapq %rdi
+; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX512F-NEXT:    andq %rdi, %rax
+; GFNIAVX512F-NEXT:    shlq $4, %rax
+; GFNIAVX512F-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; GFNIAVX512F-NEXT:    andq %rdi, %rcx
+; GFNIAVX512F-NEXT:    shrq $4, %rcx
+; GFNIAVX512F-NEXT:    orq %rax, %rcx
+; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512F-NEXT:    andq %rcx, %rax
+; GFNIAVX512F-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; GFNIAVX512F-NEXT:    andq %rcx, %rdx
+; GFNIAVX512F-NEXT:    shrq $2, %rdx
+; GFNIAVX512F-NEXT:    leaq (%rdx,%rax,4), %rax
+; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512F-NEXT:    andq %rax, %rcx
+; GFNIAVX512F-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512F-NEXT:    andq %rax, %rdx
+; GFNIAVX512F-NEXT:    shrq %rdx
+; GFNIAVX512F-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_i64:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    bswapq %rdi
+; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; GFNIAVX512BW-NEXT:    andq %rdi, %rax
+; GFNIAVX512BW-NEXT:    shlq $4, %rax
+; GFNIAVX512BW-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; GFNIAVX512BW-NEXT:    andq %rdi, %rcx
+; GFNIAVX512BW-NEXT:    shrq $4, %rcx
+; GFNIAVX512BW-NEXT:    orq %rax, %rcx
+; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512BW-NEXT:    andq %rcx, %rax
+; GFNIAVX512BW-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
+; GFNIAVX512BW-NEXT:    shrq $2, %rdx
+; GFNIAVX512BW-NEXT:    leaq (%rdx,%rax,4), %rax
+; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    andq %rax, %rcx
+; GFNIAVX512BW-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT:    andq %rax, %rdx
+; GFNIAVX512BW-NEXT:    shrq %rdx
+; GFNIAVX512BW-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512BW-NEXT:    retq
   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %b
 }
@@ -288,6 +733,31 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v16i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %b
 }
@@ -353,6 +823,36 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v8i16:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v8i16:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v8i16:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %b
 }
@@ -423,6 +923,36 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v4i32:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v4i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v4i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v4i32:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v4i32:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %b
 }
@@ -495,6 +1025,36 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v2i64:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v2i64:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v2i64:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v2i64:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v2i64:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %b
 }
@@ -634,6 +1194,40 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v32i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v32i8:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %b
 }
@@ -789,6 +1383,49 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v16i16:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v16i16:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v16i16:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v16i16:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v16i16:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %b
 }
@@ -953,6 +1590,49 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v8i32:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v8i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v8i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v8i32:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v8i32:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %b
 }
@@ -1121,6 +1801,49 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v4i64:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v4i64:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v4i64:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v4i64:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v4i64:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   ret <4 x i64> %b
 }
@@ -1366,6 +2089,49 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v64i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v64i8:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
   ret <64 x i8> %b
 }
@@ -1645,6 +2411,66 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v32i16:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v32i16:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v32i16:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
   ret <32 x i16> %b
 }
@@ -1941,6 +2767,66 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v16i32:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v16i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v16i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
   ret <16 x i32> %b
 }
@@ -2245,6 +3131,66 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    retq
+;
+; GFNISSE-LABEL: test_bitreverse_v8i64:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: test_bitreverse_v8i64:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_v8i64:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
+; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
+; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
   %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
   ret <8 x i64> %b
 }
@@ -2277,6 +3223,31 @@ define <16 x i8> @fold_bitreverse_v16i8() nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: fold_bitreverse_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: fold_bitreverse_v16i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: fold_bitreverse_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: fold_bitreverse_v16i8:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
   ret <16 x i8> %b
 }
@@ -2297,6 +3268,32 @@ define <16 x i16> @fold_bitreverse_v16i16() nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: fold_bitreverse_v16i16:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: fold_bitreverse_v16i16:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: fold_bitreverse_v16i16:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: fold_bitreverse_v16i16:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
   ret <16 x i16> %b
 }
@@ -2332,6 +3329,36 @@ define <16 x i32> @fold_bitreverse_v16i32() nounwind {
 ; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
 ; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
 ; XOP-NEXT:    retq
+;
+; GFNISSE-LABEL: fold_bitreverse_v16i32:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
+; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: fold_bitreverse_v16i32:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
+; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
+; GFNIAVX-NEXT:    retq
+;
+; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
+; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512F-LABEL: fold_bitreverse_v16i32:
+; GFNIAVX512F:       # %bb.0:
+; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
+; GFNIAVX512F-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
+; GFNIAVX512BW-NEXT:    retq
   %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
   ret <16 x i32> %b
 }


        


More information about the llvm-commits mailing list