[llvm] [X86] LowerBITREVERSE - use AND+CMPEQ+MOVMSK trick to lower scalar types (PR #92236)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed May 15 02:56:27 PDT 2024


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/92236

By broadcasting each byte of a scalar into 8 copies, with the bytes in reverse order, we can AND each copy with a different single-bit mask to test whether that bit is set, and then use PMOVMSKB/VPTESTMB to pack the per-byte results back together to get the bit reversal.

So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB).

Fixes #79794

>From dd32f93696571166cad9f170df5a7b83735d7d23 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 15 May 2024 10:28:36 +0100
Subject: [PATCH] [X86] LowerBITREVERSE - use AND+CMPEQ+MOVMSK trick to lower
 i8/i16/i32 scalar types

By broadcasting each byte of a scalar into 8 copies, with the bytes in reverse order, we can AND each copy with a different single-bit mask to test whether that bit is set, and then use PMOVMSKB (or VPTESTMB with AVX512BW) to pack the per-byte results back together to get the bit reversal.

So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB).

Fixes #79794
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  51 ++-
 .../lib/Target/X86/X86TargetTransformInfo.cpp |   2 +
 .../CostModel/X86/bitreverse-codesize.ll      |  28 +-
 .../CostModel/X86/bitreverse-latency.ll       |  48 ++-
 .../CostModel/X86/bitreverse-sizelatency.ll   |  28 +-
 .../test/Analysis/CostModel/X86/bitreverse.ll |  48 ++-
 llvm/test/CodeGen/X86/vector-bitreverse.ll    | 407 ++++++++++++------
 7 files changed, 450 insertions(+), 162 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a57c10e784d9c..5d3d290365070 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1294,6 +1294,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::BITREVERSE,       VT, Custom);
       setOperationAction(ISD::CTLZ,             VT, Custom);
     }
+    setOperationAction(ISD::BITREVERSE,         MVT::i8,    Custom);
+    setOperationAction(ISD::BITREVERSE,         MVT::i16,   Custom);
 
     // These might be better off as horizontal vector ops.
     setOperationAction(ISD::ADD,                MVT::i16, Custom);
@@ -1520,6 +1522,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
+    // TODO: Support AVX1 once i32 bitreverse codegen is better.
+    if (Subtarget.hasInt256())
+      setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
+
     setOperationAction(ISD::SETCC,          MVT::v4f64, Custom);
     setOperationAction(ISD::SETCC,          MVT::v8f32, Custom);
     setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f64, Custom);
@@ -1916,6 +1922,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
+    if (HasBWI)
+      setOperationAction(ISD::BITREVERSE,   MVT::i64, Custom);
+
     setOperationAction(ISD::SETCC,          MVT::v8f64, Custom);
     setOperationAction(ISD::SETCC,          MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f64, Custom);
@@ -31439,11 +31448,51 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return splitVectorIntUnary(Op, DAG, DL);
 
-  // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
   if (!VT.isVector()) {
     assert(
         (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
         "Only tested for i8/i16/i32/i64");
+
+    // Lower i8/i16/i32/i64 with vXi8 AND+CMPEQ+MOVMSK trick.
+    // Broadcast each byte across 8 bytes in byteswapped order, test if each bit
+    // is set and use MOVMSK/VPTESTMB to pack the bit results together.
+    if (!Subtarget.hasGFNI()) {
+      assert((VT == MVT::i8 || VT == MVT::i16 ||
+              (VT == MVT::i32 && Subtarget.hasInt256()) ||
+              (VT == MVT::i64 && Subtarget.hasBWI())) &&
+             "Unsupported scalar bitreverse");
+      unsigned VecBits = std::max<unsigned>(VT.getSizeInBits() * 8, 128);
+      unsigned NumBytes = VecBits / 8;
+      MVT ConstVT = MVT::getVectorVT(MVT::i64, VecBits / 64);
+      MVT ByteVT = MVT::getVectorVT(MVT::i8, VecBits / 8);
+      MVT SrcVT = MVT::getVectorVT(VT, VecBits / VT.getScalarSizeInBits());
+      SDValue Splat = DAG.getBitcast(ByteVT, DAG.getSplat(SrcVT, DL, In));
+
+      // Repeat each byte 8 times in reverse order.
+      SmallVector<int, 32> ByteSplatMask;
+      for (unsigned I = 0, E = VT.getSizeInBits() / 8; I != E; ++I)
+        ByteSplatMask.append(8, (E - 1) - I);
+      ByteSplatMask.append(NumBytes - ByteSplatMask.size(), -1);
+      SDValue Bytes =
+          DAG.getVectorShuffle(ByteVT, DL, Splat, Splat, ByteSplatMask);
+
+      SDValue Mask = DAG.getBitcast(
+          ByteVT, DAG.getConstant(0x102040810204080ULL, DL, ConstVT));
+      SDValue And = DAG.getNode(ISD::AND, DL, ByteVT, Bytes, Mask);
+      if (Subtarget.hasBWI() && (VT == MVT::i64 || Subtarget.hasVLX())) {
+        MVT CmpVT = MVT::getVectorVT(MVT::i1, NumBytes);
+        SDValue Zero = DAG.getConstant(0, DL, ByteVT);
+        SDValue Cmp = DAG.getSetCC(DL, CmpVT, And, Zero, ISD::CondCode::SETNE);
+        SDValue Msk = DAG.getBitcast(MVT::getIntegerVT(NumBytes), Cmp);
+        return DAG.getZExtOrTrunc(Msk, DL, VT);
+      } else {
+        SDValue Cmp = DAG.getSetCC(DL, ByteVT, And, Mask, ISD::CondCode::SETEQ);
+        SDValue Msk = getPMOVMSKB(DL, Cmp, DAG, Subtarget);
+        return DAG.getZExtOrTrunc(Msk, DL, VT);
+      }
+    }
+
+    // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
     Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ac66144aeaaec..252b7a7dcbc83 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3959,6 +3959,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 11, 21 } },
     { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 11, 21 } },
     { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 10, 16 } },
+    { ISD::BITREVERSE, MVT::i16,     {  5,  6,  6, 10 } }, // AND+PCMPEQB+PMOVMSKB
+    { ISD::BITREVERSE, MVT::i8,      {  4,  4,  7,  8 } }, // AND+PCMPEQB+PMOVMSKB
     { ISD::BSWAP,      MVT::v2i64,   {  2,  3,  1,  5 } },
     { ISD::BSWAP,      MVT::v4i32,   {  2,  3,  1,  5 } },
     { ISD::BSWAP,      MVT::v8i16,   {  2,  3,  1,  5 } },
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll
index e02ba761aa8d2..59c5afa86c82c 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll
@@ -101,13 +101,25 @@ define i32 @var_bitreverse_i32(i32 %a) {
 }
 
 define i16 @var_bitreverse_i16(i16 %a) {
-; X86-LABEL: 'var_bitreverse_i16'
-; X86-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i16'
-; X64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+; SSE42-LABEL: 'var_bitreverse_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX512-LABEL: 'var_bitreverse_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -139,11 +151,11 @@ define i16 @var_bitreverse_i16(i16 %a) {
 
 define i8 @var_bitreverse_i8(i8 %a) {
 ; X86-LABEL: 'var_bitreverse_i8'
-; X86-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; X86-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; X64-LABEL: 'var_bitreverse_i8'
-; X64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; X64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
index ba231b985c26b..a2333dd373c60 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
@@ -101,13 +101,25 @@ define i32 @var_bitreverse_i32(i32 %a) {
 }
 
 define i16 @var_bitreverse_i16(i16 %a) {
-; X86-LABEL: 'var_bitreverse_i16'
-; X86-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i16'
-; X64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+; SSE42-LABEL: 'var_bitreverse_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
+;
+; AVX512-LABEL: 'var_bitreverse_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -138,13 +150,25 @@ define i16 @var_bitreverse_i16(i16 %a) {
 }
 
 define i8 @var_bitreverse_i8(i8 %a) {
-; X86-LABEL: 'var_bitreverse_i8'
-; X86-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; SSE42-LABEL: 'var_bitreverse_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i8'
-; X64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+; AVX512-LABEL: 'var_bitreverse_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
index d60fac228fc06..833d1f88017ff 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
@@ -102,11 +102,11 @@ define i32 @var_bitreverse_i32(i32 %a) {
 
 define i16 @var_bitreverse_i16(i16 %a) {
 ; X86-LABEL: 'var_bitreverse_i16'
-; X86-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; X86-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; X64-LABEL: 'var_bitreverse_i16'
-; X64-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; X64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i16'
@@ -138,13 +138,25 @@ define i16 @var_bitreverse_i16(i16 %a) {
 }
 
 define i8 @var_bitreverse_i8(i8 %a) {
-; X86-LABEL: 'var_bitreverse_i8'
-; X86-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i8'
-; X64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+; SSE42-LABEL: 'var_bitreverse_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
+;
+; AVX512-LABEL: 'var_bitreverse_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse.ll b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
index a890147fee465..41a42f6062a0c 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
@@ -101,13 +101,25 @@ define i32 @var_bitreverse_i32(i32 %a) {
 }
 
 define i16 @var_bitreverse_i16(i16 %a) {
-; X86-LABEL: 'var_bitreverse_i16'
-; X86-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i16'
-; X64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+; SSE42-LABEL: 'var_bitreverse_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
+;
+; AVX512-LABEL: 'var_bitreverse_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i16'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -138,13 +150,25 @@ define i16 @var_bitreverse_i16(i16 %a) {
 }
 
 define i8 @var_bitreverse_i8(i8 %a) {
-; X86-LABEL: 'var_bitreverse_i8'
-; X86-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X86-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+; SSE2-LABEL: 'var_bitreverse_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; SSE42-LABEL: 'var_bitreverse_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; AVX1-LABEL: 'var_bitreverse_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
 ;
-; X64-LABEL: 'var_bitreverse_i8'
-; X64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
-; X64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
+; AVX512-LABEL: 'var_bitreverse_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_i8'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 90cc3d5fdde82..74adb96446829 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
@@ -17,39 +17,78 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
 
 define i8 @test_bitreverse_i8(i8 %a) nounwind {
-; SSE-LABEL: test_bitreverse_i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    rolb $4, %dil
-; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andb $51, %al
-; SSE-NEXT:    shlb $2, %al
-; SSE-NEXT:    shrb $2, %dil
-; SSE-NEXT:    andb $51, %dil
-; SSE-NEXT:    orb %dil, %al
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    andb $85, %cl
-; SSE-NEXT:    addb %cl, %cl
-; SSE-NEXT:    shrb %al
-; SSE-NEXT:    andb $85, %al
-; SSE-NEXT:    orb %cl, %al
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_bitreverse_i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    rolb $4, %dil
+; SSE2-NEXT:    movl %edi, %eax
+; SSE2-NEXT:    andb $51, %al
+; SSE2-NEXT:    shlb $2, %al
+; SSE2-NEXT:    shrb $2, %dil
+; SSE2-NEXT:    andb $51, %dil
+; SSE2-NEXT:    orb %dil, %al
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    andb $85, %cl
+; SSE2-NEXT:    addb %cl, %cl
+; SSE2-NEXT:    shrb %al
+; SSE2-NEXT:    andb $85, %al
+; SSE2-NEXT:    orb %cl, %al
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    rolb $4, %dil
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andb $51, %al
-; AVX-NEXT:    shlb $2, %al
-; AVX-NEXT:    shrb $2, %dil
-; AVX-NEXT:    andb $51, %dil
-; AVX-NEXT:    orb %dil, %al
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    andb $85, %cl
-; AVX-NEXT:    addb %cl, %cl
-; AVX-NEXT:    shrb %al
-; AVX-NEXT:    andb $85, %al
-; AVX-NEXT:    orb %cl, %al
-; AVX-NEXT:    retq
+; SSSE3-LABEL: test_bitreverse_i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT:    pmovmskb %xmm0, %eax
+; SSSE3-NEXT:    # kill: def $al killed $al killed $eax
+; SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: test_bitreverse_i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_bitreverse_i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovd %edi, %xmm0
+; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovmskb %xmm0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitreverse_i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %edi, %xmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX512BW-NEXT:    vptestmb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i8:
 ; XOP:       # %bb.0:
@@ -79,51 +118,85 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 }
 
 define i16 @test_bitreverse_i16(i16 %a) nounwind {
-; SSE-LABEL: test_bitreverse_i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
-; SSE-NEXT:    rolw $8, %di
-; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
-; SSE-NEXT:    shll $4, %eax
-; SSE-NEXT:    shrl $4, %edi
-; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
-; SSE-NEXT:    orl %eax, %edi
-; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
-; SSE-NEXT:    shrl $2, %edi
-; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
-; SSE-NEXT:    leal (%rdi,%rax,4), %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
-; SSE-NEXT:    leal (%rax,%rcx,2), %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_bitreverse_i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
+; SSE2-NEXT:    rolw $8, %di
+; SSE2-NEXT:    movl %edi, %eax
+; SSE2-NEXT:    andl $3855, %eax # imm = 0xF0F
+; SSE2-NEXT:    shll $4, %eax
+; SSE2-NEXT:    shrl $4, %edi
+; SSE2-NEXT:    andl $3855, %edi # imm = 0xF0F
+; SSE2-NEXT:    orl %eax, %edi
+; SSE2-NEXT:    movl %edi, %eax
+; SSE2-NEXT:    andl $13107, %eax # imm = 0x3333
+; SSE2-NEXT:    shrl $2, %edi
+; SSE2-NEXT:    andl $13107, %edi # imm = 0x3333
+; SSE2-NEXT:    leal (%rdi,%rax,4), %eax
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    andl $21845, %ecx # imm = 0x5555
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    andl $21845, %eax # imm = 0x5555
+; SSE2-NEXT:    leal (%rax,%rcx,2), %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX-NEXT:    rolw $8, %di
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
-; AVX-NEXT:    shll $4, %eax
-; AVX-NEXT:    shrl $4, %edi
-; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
-; AVX-NEXT:    orl %eax, %edi
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
-; AVX-NEXT:    shrl $2, %edi
-; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
-; AVX-NEXT:    leal (%rdi,%rax,4), %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
-; AVX-NEXT:    leal (%rax,%rcx,2), %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX-NEXT:    retq
+; SSSE3-LABEL: test_bitreverse_i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT:    pmovmskb %xmm0, %eax
+; SSSE3-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: test_bitreverse_i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_bitreverse_i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovd %edi, %xmm0
+; AVX512F-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovmskb %xmm0, %eax
+; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitreverse_i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw %edi, %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [72624976668147840,72624976668147840]
+; AVX512BW-NEXT:    vptestmb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i16:
 ; XOP:       # %bb.0:
@@ -177,27 +250,61 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; SSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX-NEXT:    bswapl %edi
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX-NEXT:    shll $4, %eax
-; AVX-NEXT:    shrl $4, %edi
-; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; AVX-NEXT:    orl %eax, %edi
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; AVX-NEXT:    shrl $2, %edi
-; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; AVX-NEXT:    leal (%rdi,%rax,4), %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; AVX-NEXT:    leal (%rax,%rcx,2), %eax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT:    bswapl %edi
+; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; AVX1-NEXT:    shll $4, %eax
+; AVX1-NEXT:    shrl $4, %edi
+; AVX1-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; AVX1-NEXT:    orl %eax, %edi
+; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; AVX1-NEXT:    shrl $2, %edi
+; AVX1-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; AVX1-NEXT:    leal (%rdi,%rax,4), %eax
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; AVX1-NEXT:    leal (%rax,%rcx,2), %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,17,17,17,17,17,17,17,17,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [72624976668147840,72624976668147840,72624976668147840,72624976668147840]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_bitreverse_i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovd %edi, %xmm0
+; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,17,17,17,17,17,17,17,17,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [72624976668147840,72624976668147840,72624976668147840,72624976668147840]
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovmskb %ymm0, %eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitreverse_i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd %edi, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,17,17,17,17,17,17,17,17,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [72624976668147840,72624976668147840,72624976668147840,72624976668147840]
+; AVX512BW-NEXT:    vptestmb %ymm1, %ymm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i32:
 ; XOP:       # %bb.0:
@@ -250,29 +357,87 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    bswapq %rdi
-; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    shrq $4, %rax
-; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; AVX-NEXT:    andq %rcx, %rax
-; AVX-NEXT:    andq %rcx, %rdi
-; AVX-NEXT:    shlq $4, %rdi
-; AVX-NEXT:    orq %rax, %rdi
-; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; AVX-NEXT:    movq %rdi, %rcx
-; AVX-NEXT:    andq %rax, %rcx
-; AVX-NEXT:    shrq $2, %rdi
-; AVX-NEXT:    andq %rax, %rdi
-; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
-; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; AVX-NEXT:    movq %rax, %rdx
-; AVX-NEXT:    andq %rcx, %rdx
-; AVX-NEXT:    shrq %rax
-; AVX-NEXT:    andq %rcx, %rax
-; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    bswapq %rdi
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    shrq $4, %rax
+; AVX1-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; AVX1-NEXT:    andq %rcx, %rax
+; AVX1-NEXT:    andq %rcx, %rdi
+; AVX1-NEXT:    shlq $4, %rdi
+; AVX1-NEXT:    orq %rax, %rdi
+; AVX1-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX1-NEXT:    movq %rdi, %rcx
+; AVX1-NEXT:    andq %rax, %rcx
+; AVX1-NEXT:    shrq $2, %rdi
+; AVX1-NEXT:    andq %rax, %rdi
+; AVX1-NEXT:    leaq (%rdi,%rcx,4), %rax
+; AVX1-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    andq %rcx, %rdx
+; AVX1-NEXT:    shrq %rax
+; AVX1-NEXT:    andq %rcx, %rax
+; AVX1-NEXT:    leaq (%rax,%rdx,2), %rax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    bswapq %rdi
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shrq $4, %rax
+; AVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rcx, %rax
+; AVX2-NEXT:    andq %rcx, %rdi
+; AVX2-NEXT:    shlq $4, %rdi
+; AVX2-NEXT:    orq %rax, %rdi
+; AVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %rdi, %rcx
+; AVX2-NEXT:    andq %rax, %rcx
+; AVX2-NEXT:    shrq $2, %rdi
+; AVX2-NEXT:    andq %rax, %rdi
+; AVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
+; AVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    andq %rcx, %rdx
+; AVX2-NEXT:    shrq %rax
+; AVX2-NEXT:    andq %rcx, %rax
+; AVX2-NEXT:    leaq (%rax,%rdx,2), %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_bitreverse_i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    bswapq %rdi
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    shrq $4, %rax
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rcx, %rax
+; AVX512F-NEXT:    andq %rcx, %rdi
+; AVX512F-NEXT:    shlq $4, %rdi
+; AVX512F-NEXT:    orq %rax, %rdi
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %rdi, %rcx
+; AVX512F-NEXT:    andq %rax, %rcx
+; AVX512F-NEXT:    shrq $2, %rdi
+; AVX512F-NEXT:    andq %rax, %rdi
+; AVX512F-NEXT:    leaq (%rdi,%rcx,4), %rax
+; AVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %rax, %rdx
+; AVX512F-NEXT:    andq %rcx, %rdx
+; AVX512F-NEXT:    shrq %rax
+; AVX512F-NEXT:    andq %rcx, %rax
+; AVX512F-NEXT:    leaq (%rax,%rdx,2), %rax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitreverse_i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastq %rdi, %zmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,21,21,21,21,21,21,21,21,20,20,20,20,20,20,20,20,35,35,35,35,35,35,35,35,34,34,34,34,34,34,34,34,49,49,49,49,49,49,49,49,48,48,48,48,48,48,48,48]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [72624976668147840,72624976668147840,72624976668147840,72624976668147840,72624976668147840,72624976668147840,72624976668147840,72624976668147840]
+; AVX512BW-NEXT:    vptestmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i64:
 ; XOP:       # %bb.0:



More information about the llvm-commits mailing list