[llvm] 8bb2428 - [SelectionDAG] Optimize bitreverse expansion to minimize the number of mask constants.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 26 09:33:41 PDT 2021


Author: Craig Topper
Date: 2021-08-26T09:33:24-07:00
New Revision: 8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b

URL: https://github.com/llvm/llvm-project/commit/8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b
DIFF: https://github.com/llvm/llvm-project/commit/8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b.diff

LOG: [SelectionDAG] Optimize bitreverse expansion to minimize the number of mask constants.

We can halve the number of mask constants by masking before the shl
and after the srl, so each swap step reuses a single mask.

This can reduce the number of mov-immediate or constant
materializations, or the number of constant pool loads for X86
vectors.

I think we might be able to do something similar for bswap. I'll
look at it next.

Differential Revision: https://reviews.llvm.org/D108738
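A minimal C++ sketch of the scalar logic the new expansion emits for a
32-bit bitreverse (the function name, test value, and the GCC/Clang
__builtin_bswap32 builtin are illustrative, not part of the patch).
Each swap step shifts right, then applies the same mask that the left
shift path uses, matching the Mask4/Mask2/Mask1 reuse in the diff below:

  #include <cstdint>
  #include <cstdio>

  // Byte-swap first, then swap nibbles, bit-pairs, and single bits.
  // Masking after srl and before shl means only 0x0F0F0F0F, 0x33333333
  // and 0x55555555 need to be materialized; the 0xF0/0xCC/0xAA
  // counterparts are no longer required.
  static uint32_t bitreverse32(uint32_t V) {
    V = __builtin_bswap32(V);
    V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4); // swap i4
    V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2); // swap i2
    V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1); // swap i1
    return V;
  }

  int main() {
    printf("%08x\n", bitreverse32(0x12345678u)); // prints 1e6a2c48
    return 0;
  }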

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/RISCV/rv32zbp.ll
    llvm/test/CodeGen/RISCV/rv64zbp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/X86/bitreverse.ll
    llvm/test/CodeGen/X86/combine-bitreverse.ll
    llvm/test/CodeGen/X86/pr43820.ll
    llvm/test/CodeGen/X86/vector-bitreverse.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index accda2588c883..0cd15de4d6413 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7296,34 +7296,31 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
   // TODO: We can easily support i4/i2 legal types if any target ever does.
   if (Sz >= 8 && isPowerOf2_32(Sz)) {
     // Create the masks - repeating the pattern every byte.
-    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
-    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
-    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
-    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
-    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
-    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+    APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+    APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+    APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));
 
     // BSWAP if the type is wider than a single byte.
     Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
 
-    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
-    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
-    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
     return Tmp;

diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index cfad9fb9110a4..1717526a608cd 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -2453,13 +2453,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    andi a1, a0, 51
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    andi a0, a0, 204
 ; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 85
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    andi a0, a0, 170
 ; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    andi a0, a0, 85
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -2484,33 +2484,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 1
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    lui a2, 15
-; RV32I-NEXT:    addi a2, a2, 240
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    lui a2, 1
+; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 3
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    lui a2, 13
-; RV32I-NEXT:    addi a2, a2, -820
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 5
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    lui a2, 11
-; RV32I-NEXT:    addi a2, a2, -1366
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32B-LABEL: bitreverse_i16:
@@ -2543,33 +2537,27 @@ define i32 @bitreverse_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    lui a2, 986895
-; RV32I-NEXT:    addi a2, a2, 240
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    lui a2, 838861
-; RV32I-NEXT:    addi a2, a2, -820
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    lui a2, 699051
-; RV32I-NEXT:    addi a2, a2, -1366
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32B-LABEL: bitreverse_i32:
@@ -2602,58 +2590,52 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi t0, a2, -241
-; RV32I-NEXT:    and a2, a1, t0
-; RV32I-NEXT:    slli a2, a2, 4
-; RV32I-NEXT:    lui a3, 986895
-; RV32I-NEXT:    addi t1, a3, 240
-; RV32I-NEXT:    and a1, a1, t1
-; RV32I-NEXT:    srli a1, a1, 4
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi t2, a2, 819
-; RV32I-NEXT:    and a2, a1, t2
-; RV32I-NEXT:    slli a2, a2, 2
-; RV32I-NEXT:    lui a4, 838861
-; RV32I-NEXT:    addi t3, a4, -820
-; RV32I-NEXT:    and a1, a1, t3
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a3, a2, 1365
-; RV32I-NEXT:    and a2, a1, a3
-; RV32I-NEXT:    slli a2, a2, 1
-; RV32I-NEXT:    lui a5, 699051
-; RV32I-NEXT:    addi a5, a5, -1366
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    lui a4, 61681
+; RV32I-NEXT:    addi a4, a4, -241
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    slli a1, a1, 4
+; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 2
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    and a2, a2, a5
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a2, a1, a2
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    or t0, a2, a1
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    slli a4, a0, 8
-; RV32I-NEXT:    and a4, a4, a7
+; RV32I-NEXT:    srli a2, a0, 24
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    and a2, a2, a7
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    and a1, a0, t0
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    and a0, a0, t1
-; RV32I-NEXT:    srli a0, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    and a1, a0, t2
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    and a0, a0, t3
-; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    and a1, a0, a3
-; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    and a1, a1, a5
 ; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    or a1, a1, a0
+; RV32I-NEXT:    mv a0, t0
 ; RV32I-NEXT:    ret
 ;
 ; RV32B-LABEL: bitreverse_i64:
@@ -2756,33 +2738,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    lui a3, 986895
-; RV32I-NEXT:    addi a3, a3, 240
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    addi a3, a3, -241
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    srli a0, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    lui a3, 838861
-; RV32I-NEXT:    addi a3, a3, -820
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    lui a3, 699051
-; RV32I-NEXT:    addi a3, a3, -1366
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a3, a3, 1365
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a2, a0, 24
@@ -2813,82 +2789,76 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a3, a1, 8
 ; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi t0, a2, -256
-; RV32I-NEXT:    and a3, a3, t0
+; RV32I-NEXT:    addi a6, a2, -256
+; RV32I-NEXT:    and a3, a3, a6
 ; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    or a4, a3, a4
-; RV32I-NEXT:    slli a5, a1, 8
-; RV32I-NEXT:    lui t1, 4080
-; RV32I-NEXT:    and a5, a5, t1
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a4, a1, 8
+; RV32I-NEXT:    lui a7, 4080
+; RV32I-NEXT:    and a4, a4, a7
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    srli a3, a1, 4
 ; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a6, a4, -241
-; RV32I-NEXT:    and a5, a1, a6
-; RV32I-NEXT:    slli a5, a5, 4
-; RV32I-NEXT:    lui a4, 986895
-; RV32I-NEXT:    addi a7, a4, 240
-; RV32I-NEXT:    and a1, a1, a7
-; RV32I-NEXT:    srli a1, a1, 4
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    lui a5, 209715
-; RV32I-NEXT:    addi t2, a5, 819
-; RV32I-NEXT:    and a4, a1, t2
-; RV32I-NEXT:    slli a4, a4, 2
-; RV32I-NEXT:    lui a2, 838861
-; RV32I-NEXT:    addi t3, a2, -820
-; RV32I-NEXT:    and a1, a1, t3
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lui a4, 349525
-; RV32I-NEXT:    addi a4, a4, 1365
-; RV32I-NEXT:    and a3, a1, a4
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    lui a5, 699051
-; RV32I-NEXT:    addi a5, a5, -1366
+; RV32I-NEXT:    addi t0, a4, -241
+; RV32I-NEXT:    and a3, a3, t0
+; RV32I-NEXT:    and a1, a1, t0
+; RV32I-NEXT:    slli a1, a1, 4
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srli a3, a1, 2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    and a3, a3, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    and a3, a3, a5
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    srli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, t0
-; RV32I-NEXT:    srli a2, a0, 24
-; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, t1
+; RV32I-NEXT:    and a3, a3, a6
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a4, a0, 8
+; RV32I-NEXT:    and a4, a4, a7
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    and a2, a0, a6
-; RV32I-NEXT:    slli a2, a2, 4
-; RV32I-NEXT:    and a0, a0, a7
-; RV32I-NEXT:    srli a0, a0, 4
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    and a2, a0, t2
-; RV32I-NEXT:    slli a2, a2, 2
-; RV32I-NEXT:    and a0, a0, t3
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    and a2, a0, a4
-; RV32I-NEXT:    slli a2, a2, 1
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    and a3, a3, t0
+; RV32I-NEXT:    and a0, a0, t0
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    srli a3, a0, 2
+; RV32I-NEXT:    and a3, a3, a2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a2, a2, a5
 ; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    or a0, a2, a0
 ; RV32I-NEXT:    srli a2, a0, 8
-; RV32I-NEXT:    and a2, a2, t0
+; RV32I-NEXT:    and a2, a2, a6
 ; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, t1
+; RV32I-NEXT:    and a3, a3, a7
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    srli a2, a1, 8
-; RV32I-NEXT:    and a2, a2, t0
+; RV32I-NEXT:    and a2, a2, a6
 ; RV32I-NEXT:    srli a3, a1, 24
 ; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    slli a3, a1, 8
-; RV32I-NEXT:    and a3, a3, t1
+; RV32I-NEXT:    and a3, a3, a7
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    or a1, a1, a2

diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll
index c83698b70b763..4c35a53e61db6 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll
@@ -2816,13 +2816,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    andi a1, a0, 51
 ; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    andi a0, a0, 204
 ; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    andi a0, a0, 51
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 85
 ; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    andi a0, a0, 170
 ; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -2847,33 +2847,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 1
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a2, 15
-; RV64I-NEXT:    addiw a2, a2, 240
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 3
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 13
-; RV64I-NEXT:    addiw a2, a2, -820
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 5
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 11
-; RV64I-NEXT:    addiw a2, a2, -1366
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64B-LABEL: bitreverse_i16:
@@ -2906,35 +2900,27 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a2, 241
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 240
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 838861
-; RV64I-NEXT:    addiw a2, a2, -820
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 699051
-; RV64I-NEXT:    addiw a2, a2, -1366
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;
@@ -2967,35 +2953,27 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 4
-; RV64I-NEXT:    lui a3, 241
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    lui a3, 61681
 ; RV64I-NEXT:    addiw a3, a3, -241
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, 240
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 2
-; RV64I-NEXT:    lui a3, 838861
-; RV64I-NEXT:    addiw a3, a3, -820
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 1
-; RV64I-NEXT:    lui a3, 699051
-; RV64I-NEXT:    addiw a3, a3, -1366
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
 ;
@@ -3049,69 +3027,45 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addiw a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a2, 1044721
-; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addiw a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 241
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 240
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 13107
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 1035469
-; RV64I-NEXT:    addiw a2, a2, -819
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a2, 13107
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -820
+; RV64I-NEXT:    addi a2, a2, 819
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 21845
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 1026731
-; RV64I-NEXT:    addiw a2, a2, -1365
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a2, 21845
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1366
+; RV64I-NEXT:    addi a2, a2, 1365
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64B-LABEL: bitreverse_i64:
@@ -3210,35 +3164,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a3, 241
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    lui a3, 61681
 ; RV64I-NEXT:    addiw a3, a3, -241
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, 240
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a3, 838861
-; RV64I-NEXT:    addiw a3, a3, -820
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a3, 699051
-; RV64I-NEXT:    addiw a3, a3, -1366
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a2, a0, 24
@@ -3267,14 +3213,14 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
 define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-LABEL: bitreverse_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    srli a2, a0, 24
 ; RV64I-NEXT:    lui a6, 4080
-; RV64I-NEXT:    and a1, a1, a6
-; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a3, a2, a6
+; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    addi a5, zero, 255
 ; RV64I-NEXT:    slli a7, a5, 24
-; RV64I-NEXT:    and a3, a3, a7
-; RV64I-NEXT:    or a3, a3, a1
+; RV64I-NEXT:    and a4, a4, a7
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    lui a1, 16
 ; RV64I-NEXT:    addiw a1, a1, -256
@@ -3282,9 +3228,9 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-NEXT:    srli a2, a0, 56
 ; RV64I-NEXT:    or a2, a4, a2
 ; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    slli a4, a0, 8
+; RV64I-NEXT:    slli a3, a0, 8
 ; RV64I-NEXT:    slli t0, a5, 32
-; RV64I-NEXT:    and a3, a4, t0
+; RV64I-NEXT:    and a3, a3, t0
 ; RV64I-NEXT:    slli a4, a0, 24
 ; RV64I-NEXT:    slli t1, a5, 40
 ; RV64I-NEXT:    and a4, a4, t1
@@ -3296,69 +3242,45 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 3855
-; RV64I-NEXT:    addiw a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 4
-; RV64I-NEXT:    lui a3, 1044721
-; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    lui a3, 3855
+; RV64I-NEXT:    addiw a3, a3, 241
+; RV64I-NEXT:    slli a3, a3, 12
+; RV64I-NEXT:    addi a3, a3, -241
 ; RV64I-NEXT:    slli a3, a3, 12
 ; RV64I-NEXT:    addi a3, a3, 241
 ; RV64I-NEXT:    slli a3, a3, 12
 ; RV64I-NEXT:    addi a3, a3, -241
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, 240
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 4
-; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 13107
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 2
-; RV64I-NEXT:    lui a3, 1035469
-; RV64I-NEXT:    addiw a3, a3, -819
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    lui a3, 13107
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -819
+; RV64I-NEXT:    addi a3, a3, 819
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -819
+; RV64I-NEXT:    addi a3, a3, 819
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -820
+; RV64I-NEXT:    addi a3, a3, 819
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 21845
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 1
-; RV64I-NEXT:    lui a3, 1026731
-; RV64I-NEXT:    addiw a3, a3, -1365
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    lui a3, 21845
+; RV64I-NEXT:    addiw a3, a3, 1365
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -1365
+; RV64I-NEXT:    addi a3, a3, 1365
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -1365
+; RV64I-NEXT:    addi a3, a3, 1365
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -1366
+; RV64I-NEXT:    addi a3, a3, 1365
+; RV64I-NEXT:    and a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    srli a2, a0, 40
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    srli a2, a0, 56

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index c1a9fe20aa93e..a3180f0b4e317 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -12,33 +12,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 8
 ; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 1
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 15
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 3
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 13
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 5
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT:    lui a1, 11
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV32-NEXT:    vse16.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -49,33 +43,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 8
 ; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 1
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 15
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 3
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 13
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 5
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT:    lui a1, 11
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV64-NEXT:    vse16.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -86,33 +74,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    lui a1, 1
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    lui a1, 15
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    lui a1, 3
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    lui a1, 13
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV32-NEXT:    lui a1, 5
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT:    lui a1, 11
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
@@ -123,33 +105,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    lui a1, 1
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    lui a1, 15
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 240
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    lui a1, 3
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    lui a1, 13
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -820
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    lui a1, 5
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT:    lui a1, 11
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -1366
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
@@ -177,33 +153,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 986895
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 838861
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT:    lui a1, 699051
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV32-NEXT:    vse32.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -223,39 +193,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 61681
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 241
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 209715
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 205
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 349525
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT:    lui a1, 171
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV64-NEXT:    vse32.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -275,33 +233,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    lui a1, 986895
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    lui a1, 838861
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT:    lui a1, 699051
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
@@ -321,39 +273,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    lui a1, 61681
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    lui a1, 241
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    lui a1, 209715
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    lui a1, 205
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    lui a1, 349525
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT:    lui a1, 171
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <4 x i32>, <4 x i32>* %x
@@ -416,51 +356,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 986895
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 838861
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT:    lui a1, 699051
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -499,6 +424,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v28, v25
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 3855
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 241
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -507,19 +433,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 241
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 1044721
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 13107
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -528,19 +446,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 1035469
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 21845
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -549,19 +459,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT:    lui a1, 1026731
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX2-RV64-NEXT:    vse64.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -616,51 +517,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    lui a1, 986895
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 240
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    lui a1, 838861
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -820
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT:    lui a1, 699051
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -1366
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v27
 ; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
@@ -699,6 +585,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v28, v25
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    lui a1, 3855
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 241
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
@@ -707,19 +594,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 241
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    lui a1, 1044721
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, 241
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    lui a1, 13107
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
@@ -728,19 +607,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 819
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    lui a1, 1035469
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    lui a1, 21845
 ; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
@@ -749,19 +620,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 1365
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT:    lui a1, 1026731
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX1-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV64-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <2 x i64>, <2 x i64>* %x
@@ -780,33 +642,27 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 8
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 1
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 15
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 3
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 13
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 5
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT:    lui a1, 11
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -817,150 +673,132 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 8
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 1
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 15
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 3
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 13
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 5
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT:    lui a1, 11
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bitreverse_v16i16:
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (a6)
+; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a0)
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 4
 ; LMULMAX1-RV32-NEXT:    lui a2, 1
-; LMULMAX1-RV32-NEXT:    addi a7, a2, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, a7
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT:    lui a3, 15
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 240
-; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a3
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    lui a4, 3
-; LMULMAX1-RV32-NEXT:    addi a4, a4, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, a4
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT:    lui a5, 13
-; LMULMAX1-RV32-NEXT:    addi a5, a5, -820
-; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    lui a1, 5
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV32-NEXT:    lui a2, 11
-; LMULMAX1-RV32-NEXT:    addi a2, a2, -1366
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -241
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 2
+; LMULMAX1-RV32-NEXT:    lui a3, 3
+; LMULMAX1-RV32-NEXT:    addi a3, a3, 819
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a3
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 1
+; LMULMAX1-RV32-NEXT:    lui a4, 5
+; LMULMAX1-RV32-NEXT:    addi a4, a4, 1365
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a4
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 8
 ; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, a7
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a3
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, a4
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v27, v27, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 4
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 2
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a3
+; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 1
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a4
+; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV32-NEXT:    vse16.v v26, (a0)
-; LMULMAX1-RV32-NEXT:    vse16.v v25, (a6)
+; LMULMAX1-RV32-NEXT:    vse16.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v16i16:
 ; LMULMAX1-RV64:       # %bb.0:
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (a6)
+; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a0)
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 8
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 4
 ; LMULMAX1-RV64-NEXT:    lui a2, 1
-; LMULMAX1-RV64-NEXT:    addiw a7, a2, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a7
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT:    lui a3, 15
-; LMULMAX1-RV64-NEXT:    addiw a3, a3, 240
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    lui a4, 3
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    lui a5, 13
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, -820
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a5
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    lui a1, 5
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT:    lui a2, 11
-; LMULMAX1-RV64-NEXT:    addiw a2, a2, -1366
+; LMULMAX1-RV64-NEXT:    addiw a2, a2, -241
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a2
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT:    lui a3, 3
+; LMULMAX1-RV64-NEXT:    addiw a3, a3, 819
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT:    lui a4, 5
+; LMULMAX1-RV64-NEXT:    addiw a4, a4, 1365
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 8
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a7
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a5
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 4
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a2
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 2
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT:    vse16.v v26, (a0)
-; LMULMAX1-RV64-NEXT:    vse16.v v25, (a6)
+; LMULMAX1-RV64-NEXT:    vse16.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %x
   %b = load <16 x i16>, <16 x i16>* %y
@@ -987,33 +825,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v30
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 986895
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 838861
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT:    lui a1, 699051
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV32-NEXT:    vse32.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -1033,39 +865,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v30
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 61681
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 241
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 209715
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 205
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 349525
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT:    lui a1, 171
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV64-NEXT:    vse32.v v26, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -1077,67 +897,61 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a0)
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 8
 ; LMULMAX1-RV32-NEXT:    lui a2, 16
-; LMULMAX1-RV32-NEXT:    addi a7, a2, -256
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -256
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v28, v25, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT:    vsll.vi v28, v25, 8
-; LMULMAX1-RV32-NEXT:    lui t0, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT:    lui a3, 4080
+; LMULMAX1-RV32-NEXT:    vand.vx v28, v28, a3
 ; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 4
 ; LMULMAX1-RV32-NEXT:    lui a4, 61681
-; LMULMAX1-RV32-NEXT:    addi t1, a4, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, t1
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT:    lui a5, 986895
-; LMULMAX1-RV32-NEXT:    addi a5, a5, 240
-; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    lui a1, 209715
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT:    lui a2, 838861
-; LMULMAX1-RV32-NEXT:    addi a2, a2, -820
-; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    lui a3, 349525
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v25, a3
-; LMULMAX1-RV32-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV32-NEXT:    lui a4, 699051
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -1366
+; LMULMAX1-RV32-NEXT:    addi a4, a4, -241
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a4
 ; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a4
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 2
+; LMULMAX1-RV32-NEXT:    lui a5, 209715
+; LMULMAX1-RV32-NEXT:    addi a5, a5, 819
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 1
+; LMULMAX1-RV32-NEXT:    lui a1, 349525
+; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV32-NEXT:    vand.vx v25, v25, a1
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v28, v26, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT:    vsll.vi v28, v26, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT:    vand.vx v28, v28, a3
 ; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, t1
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vand.vx v27, v26, a3
-; LMULMAX1-RV32-NEXT:    vadd.vv v27, v27, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 4
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a4
 ; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a4
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 2
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 1
+; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV32-NEXT:    vand.vx v26, v26, a1
+; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV32-NEXT:    vse32.v v26, (a0)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a6)
 ; LMULMAX1-RV32-NEXT:    ret
@@ -1155,68 +969,56 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v28, v25, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v27, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT:    lui a7, 4080
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT:    lui a3, 4080
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a3
 ; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 4
 ; LMULMAX1-RV64-NEXT:    lui a4, 61681
 ; LMULMAX1-RV64-NEXT:    addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT:    lui a5, 241
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, -241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi t0, a5, 240
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, t0
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    lui a1, 209715
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    lui a3, 205
-; LMULMAX1-RV64-NEXT:    addiw a3, a3, -819
-; LMULMAX1-RV64-NEXT:    slli a3, a3, 12
-; LMULMAX1-RV64-NEXT:    addi t1, a3, -820
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    lui a5, 349525
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a5
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT:    lui a3, 171
-; LMULMAX1-RV64-NEXT:    addiw a3, a3, -1365
-; LMULMAX1-RV64-NEXT:    slli a3, a3, 12
-; LMULMAX1-RV64-NEXT:    addi a3, a3, -1366
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT:    lui a5, 209715
+; LMULMAX1-RV64-NEXT:    addiw a5, a5, 819
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT:    lui a1, 349525
+; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 8
 ; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a2
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v28, v26, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v27, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a3
 ; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, t0
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a1
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a5
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 4
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 2
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
+; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT:    vse32.v v26, (a0)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a6)
 ; LMULMAX1-RV64-NEXT:    ret
@@ -1280,51 +1082,36 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v30
 ; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 986895
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 838861
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT:    lui a1, 699051
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -1363,6 +1150,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v8, v26
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v30
 ; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    lui a1, 3855
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 241
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -1371,19 +1159,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 241
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT:    lui a1, 1044721
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, 240
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    lui a1, 13107
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -1392,19 +1172,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT:    lui a1, 1035469
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -819
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -820
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    lui a1, 21845
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
@@ -1413,19 +1185,10 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT:    lui a1, 1026731
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
-; LMULMAX2-RV64-NEXT:    addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT:    vand.vx v28, v28, a1
 ; LMULMAX2-RV64-NEXT:    vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT:    vor.vv v26, v28, v26
 ; LMULMAX2-RV64-NEXT:    vse64.v v26, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -1433,17 +1196,17 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v30, (a6)
+; LMULMAX1-RV32-NEXT:    vle64.v v29, (a6)
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a2, zero, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v30, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v29, a2
 ; LMULMAX1-RV32-NEXT:    addi a3, zero, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v27, v30, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v27, v29, a3
 ; LMULMAX1-RV32-NEXT:    lui a4, 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
 ; LMULMAX1-RV32-NEXT:    vand.vx v27, v27, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v27, v27, v26
-; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v30, 24
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v29, 24
 ; LMULMAX1-RV32-NEXT:    lui a5, 4080
 ; LMULMAX1-RV32-NEXT:    vand.vx v28, v26, a5
 ; LMULMAX1-RV32-NEXT:    addi a1, zero, 5
@@ -1454,125 +1217,106 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    lui a1, 1044480
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vsrl.vi v29, v30, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v26
-; LMULMAX1-RV32-NEXT:    vor.vv v28, v29, v28
-; LMULMAX1-RV32-NEXT:    vor.vv v31, v28, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v30, v29, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v26
+; LMULMAX1-RV32-NEXT:    vor.vv v28, v30, v28
+; LMULMAX1-RV32-NEXT:    vor.vv v30, v28, v27
 ; LMULMAX1-RV32-NEXT:    addi a1, zero, 255
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v27, 0, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vsll.vi v28, v30, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v29, v28, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v28, v29, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v31, v28, v27
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a4
 ; LMULMAX1-RV32-NEXT:    vmerge.vim v28, v28, 0, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v30, 24
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v29, 24
 ; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v28
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v29
-; LMULMAX1-RV32-NEXT:    vsll.vx v9, v30, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v31, v8, v31
+; LMULMAX1-RV32-NEXT:    vsll.vx v8, v29, a3
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v29, a5
-; LMULMAX1-RV32-NEXT:    vmerge.vim v29, v29, 0, v0
+; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a5
+; LMULMAX1-RV32-NEXT:    vmerge.vim v9, v9, 0, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v29
-; LMULMAX1-RV32-NEXT:    vsll.vx v30, v30, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v30, v30, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v30, v30, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v31, v30, v31
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    vsll.vx v29, v29, a2
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v29, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v29, v31
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v29, v30
+; LMULMAX1-RV32-NEXT:    vsrl.vi v30, v29, 4
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v30, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v31, v30
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 4
-; LMULMAX1-RV32-NEXT:    lui a1, 986895
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 240
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v31, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v31, v31, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v31, v31, v8
+; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v31
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v31
+; LMULMAX1-RV32-NEXT:    vsll.vi v29, v29, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT:    vsrl.vi v30, v29, 2
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v31, v8
-; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 2
-; LMULMAX1-RV32-NEXT:    lui a1, 838861
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -820
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v31, v31, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v31, v31, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v8
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v8
+; LMULMAX1-RV32-NEXT:    vsll.vi v29, v29, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT:    vsrl.vi v30, v29, 1
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v31, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v12, v12, v12
-; LMULMAX1-RV32-NEXT:    lui a1, 699051
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -1366
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v13
-; LMULMAX1-RV32-NEXT:    vsrl.vi v31, v31, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v31, v31, v12
-; LMULMAX1-RV32-NEXT:    vsrl.vx v12, v25, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v14, v25, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v14, v14, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v14, v12
-; LMULMAX1-RV32-NEXT:    vsrl.vi v14, v25, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v14, v14, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v25, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v15, v26
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v14
-; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v12
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v25, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v27, v12, v27
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v25, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v28, v12, v28
+; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v10
+; LMULMAX1-RV32-NEXT:    vadd.vv v29, v29, v29
+; LMULMAX1-RV32-NEXT:    vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT:    vsrl.vx v30, v25, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v25, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v30, v11, v30
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v25, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v25, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v12, v26
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v30
+; LMULMAX1-RV32-NEXT:    vsll.vi v30, v25, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v30, v27
+; LMULMAX1-RV32-NEXT:    vsll.vi v30, v25, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v30, v28
 ; LMULMAX1-RV32-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV32-NEXT:    vsll.vx v28, v25, a3
-; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v29
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v9
 ; LMULMAX1-RV32-NEXT:    vsll.vx v25, v25, a2
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v30
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v8
-; LMULMAX1-RV32-NEXT:    vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v13
-; LMULMAX1-RV32-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 4
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v31
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v31
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v8
+; LMULMAX1-RV32-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vsrl.vi v26, v25, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v10
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v31, (a6)
+; LMULMAX1-RV32-NEXT:    vse64.v v29, (a6)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
-; LMULMAX1-RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; LMULMAX1-RV64-NEXT:    .cfi_offset s0, -8
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle64.v v26, (a6)
@@ -1581,33 +1325,34 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v27, v26, t0
 ; LMULMAX1-RV64-NEXT:    addi t1, zero, 40
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v28, v26, t1
-; LMULMAX1-RV64-NEXT:    lui a1, 16
-; LMULMAX1-RV64-NEXT:    addiw t4, a1, -256
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT:    lui a4, 16
+; LMULMAX1-RV64-NEXT:    addiw t2, a4, -256
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, t2
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v28, v26, 24
 ; LMULMAX1-RV64-NEXT:    lui a7, 4080
 ; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a7
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v29, v26, 8
-; LMULMAX1-RV64-NEXT:    addi a3, zero, 255
-; LMULMAX1-RV64-NEXT:    slli a1, a3, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT:    addi a1, zero, 255
+; LMULMAX1-RV64-NEXT:    slli t4, a1, 24
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, t4
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT:    slli a5, a3, 32
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT:    slli a2, a1, 32
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a2
 ; LMULMAX1-RV64-NEXT:    vsll.vi v29, v26, 24
-; LMULMAX1-RV64-NEXT:    slli a2, a3, 40
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT:    slli a3, a1, 40
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a3
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vx v29, v26, t0
 ; LMULMAX1-RV64-NEXT:    vsll.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT:    slli a3, a3, 48
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v29, v26
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    lui a4, 3855
 ; LMULMAX1-RV64-NEXT:    addiw a4, a4, 241
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
@@ -1615,20 +1360,12 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
 ; LMULMAX1-RV64-NEXT:    addi a4, a4, 241
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi t2, a4, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, t2
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT:    lui a4, 1044721
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, 241
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, -241
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi t3, a4, 240
+; LMULMAX1-RV64-NEXT:    addi t3, a4, -241
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, t3
 ; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, t3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    lui a4, 13107
 ; LMULMAX1-RV64-NEXT:    addiw a4, a4, 819
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
@@ -1636,81 +1373,62 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
 ; LMULMAX1-RV64-NEXT:    addi a4, a4, 819
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi t5, a4, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, t5
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    lui a4, 1035469
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, -819
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, -819
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, -819
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi t6, a4, -820
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, t6
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT:    lui a4, 21845
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, 1365
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX1-RV64-NEXT:    addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT:    lui s0, 1026731
-; LMULMAX1-RV64-NEXT:    addiw s0, s0, -1365
-; LMULMAX1-RV64-NEXT:    slli s0, s0, 12
-; LMULMAX1-RV64-NEXT:    addi s0, s0, -1365
-; LMULMAX1-RV64-NEXT:    slli s0, s0, 12
-; LMULMAX1-RV64-NEXT:    addi s0, s0, -1365
-; LMULMAX1-RV64-NEXT:    slli s0, s0, 12
-; LMULMAX1-RV64-NEXT:    addi s0, s0, -1366
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, s0
-; LMULMAX1-RV64-NEXT:    vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    addi a4, a4, 819
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT:    lui a5, 21845
+; LMULMAX1-RV64-NEXT:    addiw a5, a5, 1365
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a5
+; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v27, v25, t0
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v28, v25, t1
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, t2
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v28, v25, 24
 ; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a7
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v29, v25, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, t4
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a2
 ; LMULMAX1-RV64-NEXT:    vsll.vi v29, v25, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a3
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vx v29, v25, t0
 ; LMULMAX1-RV64-NEXT:    vsll.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v29, v25
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, t2
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 4
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 4
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, t3
 ; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, t3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, t5
-; LMULMAX1-RV64-NEXT:    vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, t6
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, s0
-; LMULMAX1-RV64-NEXT:    vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a5
+; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse64.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <4 x i64>, <4 x i64>* %x
   %b = load <4 x i64>, <4 x i64>* %y

diff  --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index b1c4cbead6c15..cfdbbce7f1f56 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -17,35 +17,35 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
 ; X86-NEXT:    shll $4, %edx
-; X86-NEXT:    andl $61680, %eax # imm = 0xF0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $13107, %edx # imm = 0x3333
-; X86-NEXT:    andl $52428, %eax # imm = 0xCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $13107, %eax # imm = 0x3333
 ; X86-NEXT:    leal (%eax,%edx,4), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $21845, %edx # imm = 0x5555
-; X86-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $21845, %eax # imm = 0x5555
 ; X86-NEXT:    leal (%eax,%edx,2), %eax
 ; X86-NEXT:    rolw $8, %cx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
 ; X86-NEXT:    shll $4, %edx
-; X86-NEXT:    andl $61680, %ecx # imm = 0xF0F0
 ; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $13107, %edx # imm = 0x3333
-; X86-NEXT:    andl $52428, %ecx # imm = 0xCCCC
 ; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
 ; X86-NEXT:    leal (%ecx,%edx,4), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $21845, %edx # imm = 0x5555
-; X86-NEXT:    andl $43690, %ecx # imm = 0xAAAA
 ; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
 ; X86-NEXT:    leal (%ecx,%edx,2), %edx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    # kill: def $dx killed $dx killed $edx
@@ -63,16 +63,18 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X64-NEXT:    psrlw $4, %xmm0
 ; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    psllw $2, %xmm1
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    psrlw $2, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlw $2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    psllw $2, %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; X64-NEXT:    pand %xmm0, %xmm1
+; X64-NEXT:    movdqa %xmm0, %xmm1
 ; X64-NEXT:    psrlw $1, %xmm1
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    pand %xmm2, %xmm0
 ; X64-NEXT:    paddb %xmm0, %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -96,60 +98,60 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %edx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%edx,4), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%edx,2), %eax
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %edx
-; X86-NEXT:    andl $-252645136, %ecx # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %ecx # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    leal (%ecx,%edx,4), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %ecx # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    leal (%ecx,%edx,2), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    bswapq %rdi
-; X64-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; X64-NEXT:    andq %rdi, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $4, %rax
+; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    shrq $2, %rdx
-; X64-NEXT:    leaq (%rdx,%rax,4), %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT:    andq %rcx, %rdi
+; X64-NEXT:    shlq $4, %rdi
+; X64-NEXT:    orq %rax, %rdi
+; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT:    andq %rax, %rdx
-; X64-NEXT:    shrq %rdx
-; X64-NEXT:    leaq (%rdx,%rcx,2), %rax
+; X64-NEXT:    shrq $2, %rdi
+; X64-NEXT:    andq %rax, %rdi
+; X64-NEXT:    leaq (%rdi,%rcx,4), %rax
+; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    leaq (%rax,%rdx,2), %rax
 ; X64-NEXT:    retq
 ;
 ; X86XOP-LABEL: test_bitreverse_i64:
@@ -173,18 +175,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    retl
 ;
@@ -195,18 +197,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X64-NEXT:    orl %eax, %edi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X64-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X64-NEXT:    shrl %eax
+; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X64-NEXT:    leal (%rax,%rcx,2), %eax
 ; X64-NEXT:    retq
 ;
@@ -230,18 +232,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
-; X86-NEXT:    andl $-1431655936, %eax # imm = 0xAAAAAA00
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655680, %eax # imm = 0x55555500
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    retl
@@ -253,18 +255,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X64-NEXT:    orl %eax, %edi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
-; X64-NEXT:    andl $-1431655936, %eax # imm = 0xAAAAAA00
 ; X64-NEXT:    shrl %eax
+; X64-NEXT:    andl $1431655680, %eax # imm = 0x55555500
 ; X64-NEXT:    leal (%rax,%rcx,2), %eax
 ; X64-NEXT:    shrl $8, %eax
 ; X64-NEXT:    retq
@@ -290,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $61680, %eax # imm = 0xF0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
-; X86-NEXT:    andl $52428, %eax # imm = 0xCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $13107, %eax # imm = 0x3333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
-; X86-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $21845, %eax # imm = 0x5555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
@@ -313,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; X64-NEXT:    orl %eax, %edi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andl $13107, %eax # imm = 0x3333
-; X64-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $13107, %edi # imm = 0x3333
 ; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $21845, %ecx # imm = 0x5555
-; X64-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; X64-NEXT:    shrl %eax
+; X64-NEXT:    andl $21845, %eax # imm = 0x5555
 ; X64-NEXT:    leal (%rax,%rcx,2), %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
@@ -350,14 +352,14 @@ define i8 @test_bitreverse_i8(i8 %a) {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andb $51, %cl
 ; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    andb $-52, %al
 ; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    andb $51, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andb $85, %cl
 ; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    andb $-86, %al
 ; X86-NEXT:    shrb %al
+; X86-NEXT:    andb $85, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
 ;
@@ -368,14 +370,14 @@ define i8 @test_bitreverse_i8(i8 %a) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andb $51, %al
 ; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
 ; X64-NEXT:    shrb $2, %dil
+; X64-NEXT:    andb $51, %dil
 ; X64-NEXT:    orb %al, %dil
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andb $85, %al
 ; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-86, %dil
 ; X64-NEXT:    shrb %dil
+; X64-NEXT:    andb $85, %dil
 ; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -401,14 +403,14 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andb $51, %cl
 ; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    andb $-52, %al
 ; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    andb $51, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andb $80, %cl
 ; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    andb $-96, %al
 ; X86-NEXT:    shrb %al
+; X86-NEXT:    andb $80, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    shrb $4, %al
 ; X86-NEXT:    retl
@@ -420,14 +422,14 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andb $51, %al
 ; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
 ; X64-NEXT:    shrb $2, %dil
+; X64-NEXT:    andb $51, %dil
 ; X64-NEXT:    orb %al, %dil
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andb $80, %al
 ; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-96, %dil
 ; X64-NEXT:    shrb %dil
+; X64-NEXT:    andb $80, %dil
 ; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    shrb $4, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
@@ -621,107 +623,107 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ebp
-; X86-NEXT:    andl $-252645136, %ebx # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %ebx # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
 ; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
-; X86-NEXT:    andl $-1431699456, %ebx # imm = 0xAAAA0000
 ; X86-NEXT:    shrl %ebx
+; X86-NEXT:    andl $1431633920, %ebx # imm = 0x55550000
 ; X86-NEXT:    leal (%ebx,%ebp,2), %ebx
 ; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    bswapl %edi
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ebx
-; X86-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; X86-NEXT:    leal (%edi,%ebx,4), %edi
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %edi # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
 ; X86-NEXT:    leal (%edi,%ebx,2), %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %edi
-; X86-NEXT:    andl $-252645136, %esi # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %esi # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NEXT:    leal (%esi,%edi,4), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %esi # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
 ; X86-NEXT:    leal (%esi,%edi,2), %ebx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %esi
-; X86-NEXT:    andl $-252645136, %edx # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %edx # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NEXT:    leal (%edx,%esi,4), %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %edx # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    leal (%edx,%esi,2), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %edx
-; X86-NEXT:    andl $-252645136, %ecx # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %ecx # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    leal (%ecx,%edx,4), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %ecx # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    leal (%ecx,%edx,2), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -729,18 +731,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -748,18 +750,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -767,18 +769,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -786,18 +788,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -805,18 +807,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -824,18 +826,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -843,18 +845,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -862,18 +864,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -881,18 +883,18 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -900,36 +902,36 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
 ; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
 ; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %edx
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1018,194 +1020,186 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; X64-NEXT:    bswapq %rbx
-; X64-NEXT:    movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    movq %rbx, %r10
-; X64-NEXT:    andq %r13, %r10
-; X64-NEXT:    shlq $4, %r10
-; X64-NEXT:    movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0
-; X64-NEXT:    andq %rax, %rbx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    bswapq %rdi
+; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    shrq $4, %rbx
-; X64-NEXT:    orq %r10, %rbx
+; X64-NEXT:    movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %r13, %rbx
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    shlq $4, %rdi
+; X64-NEXT:    orq %rbx, %rdi
 ; X64-NEXT:    movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; X64-NEXT:    movq %rbx, %r10
-; X64-NEXT:    andq %r11, %r10
-; X64-NEXT:    movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC
-; X64-NEXT:    andq %r14, %rbx
-; X64-NEXT:    shrq $2, %rbx
-; X64-NEXT:    leaq (%rbx,%r10,4), %r10
-; X64-NEXT:    movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %r11, %rbx
+; X64-NEXT:    shrq $2, %rdi
+; X64-NEXT:    andq %r11, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,4), %rdi
+; X64-NEXT:    movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000
+; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    andq %r10, %rbx
-; X64-NEXT:    movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000
-; X64-NEXT:    andq %r10, %rdi
 ; X64-NEXT:    shrq %rdi
-; X64-NEXT:    leaq (%rdi,%rbx,2), %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    andq %r10, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %r10
 ; X64-NEXT:    bswapq %rbp
 ; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    shrq $4, %rdi
 ; X64-NEXT:    andq %r13, %rdi
-; X64-NEXT:    shlq $4, %rdi
-; X64-NEXT:    andq %rax, %rbp
-; X64-NEXT:    shrq $4, %rbp
+; X64-NEXT:    andq %r13, %rbp
+; X64-NEXT:    shlq $4, %rbp
 ; X64-NEXT:    orq %rdi, %rbp
 ; X64-NEXT:    movq %rbp, %rdi
 ; X64-NEXT:    andq %r11, %rdi
-; X64-NEXT:    andq %r14, %rbp
-; X64-NEXT:    shrq $2, %rbp
-; X64-NEXT:    leaq (%rbp,%rdi,4), %rbp
-; X64-NEXT:    movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
-; X64-NEXT:    movq %rbp, %r10
-; X64-NEXT:    andq %rbx, %r10
-; X64-NEXT:    movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT:    andq %rdi, %rbp
-; X64-NEXT:    shrq %rbp
-; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; X64-NEXT:    bswapq %rbp
-; X64-NEXT:    movq %rbp, %r10
-; X64-NEXT:    andq %r13, %r10
-; X64-NEXT:    shlq $4, %r10
-; X64-NEXT:    andq %rax, %rbp
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    shrq $4, %rbp
-; X64-NEXT:    orq %r10, %rbp
-; X64-NEXT:    movq %rbp, %r10
-; X64-NEXT:    andq %r11, %r10
-; X64-NEXT:    andq %r14, %rbp
 ; X64-NEXT:    shrq $2, %rbp
-; X64-NEXT:    leaq (%rbp,%r10,4), %rbp
-; X64-NEXT:    movq %rbp, %r10
-; X64-NEXT:    andq %rbx, %r10
-; X64-NEXT:    andq %rdi, %rbp
-; X64-NEXT:    shrq %rbp
-; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT:    bswapq %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    andq %r13, %rax
-; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %r11, %rbp
+; X64-NEXT:    leaq (%rbp,%rdi,4), %rdi
+; X64-NEXT:    movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    andq %rbp, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %r14
+; X64-NEXT:    shrdq $48, %r14, %r10
+; X64-NEXT:    bswapq %r15
 ; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    andq %r15, %r10
-; X64-NEXT:    shrq $4, %r10
-; X64-NEXT:    orq %rax, %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    andq %r11, %rax
-; X64-NEXT:    andq %r14, %r10
-; X64-NEXT:    shrq $2, %r10
-; X64-NEXT:    leaq (%r10,%rax,4), %rax
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    andq %rbx, %r10
-; X64-NEXT:    movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT:    andq %r15, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    leaq (%rax,%r10,2), %r10
+; X64-NEXT:    shrq $4, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    andq %r13, %r15
+; X64-NEXT:    shlq $4, %r15
+; X64-NEXT:    orq %rdi, %r15
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    andq %r11, %rdi
+; X64-NEXT:    shrq $2, %r15
+; X64-NEXT:    andq %r11, %r15
+; X64-NEXT:    leaq (%r15,%rdi,4), %rdi
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    andq %rbp, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %r15
+; X64-NEXT:    shrdq $48, %r15, %r14
+; X64-NEXT:    bswapq %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    shrq $4, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    andq %r13, %r12
+; X64-NEXT:    shlq $4, %r12
+; X64-NEXT:    orq %rdi, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    andq %r11, %rdi
+; X64-NEXT:    shrq $2, %r12
+; X64-NEXT:    andq %r11, %r12
+; X64-NEXT:    leaq (%r12,%rdi,4), %rdi
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    andq %rbp, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %r12
+; X64-NEXT:    shrdq $48, %r12, %r15
 ; X64-NEXT:    bswapq %r9
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    andq %r13, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    andq %rdi, %r9
-; X64-NEXT:    shrq $4, %r9
-; X64-NEXT:    orq %rax, %r9
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    andq %r11, %rax
-; X64-NEXT:    andq %r14, %r9
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    shrq $4, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    andq %r13, %r9
+; X64-NEXT:    shlq $4, %r9
+; X64-NEXT:    orq %rdi, %r9
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    andq %r11, %rdi
 ; X64-NEXT:    shrq $2, %r9
-; X64-NEXT:    leaq (%r9,%rax,4), %rax
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    andq %rbx, %r9
-; X64-NEXT:    andq %r15, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    leaq (%rax,%r9,2), %r9
+; X64-NEXT:    andq %r11, %r9
+; X64-NEXT:    leaq (%r9,%rdi,4), %rdi
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    andq %rbp, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %r9
+; X64-NEXT:    shrdq $48, %r9, %r12
 ; X64-NEXT:    bswapq %r8
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    andq %r13, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    andq %rdi, %r8
-; X64-NEXT:    shrq $4, %r8
-; X64-NEXT:    orq %rax, %r8
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    andq %r11, %rax
-; X64-NEXT:    andq %r14, %r8
+; X64-NEXT:    movq %r8, %rdi
+; X64-NEXT:    shrq $4, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    andq %r13, %r8
+; X64-NEXT:    shlq $4, %r8
+; X64-NEXT:    orq %rdi, %r8
+; X64-NEXT:    movq %r8, %rdi
+; X64-NEXT:    andq %r11, %rdi
 ; X64-NEXT:    shrq $2, %r8
-; X64-NEXT:    leaq (%r8,%rax,4), %rax
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    andq %rbx, %r8
-; X64-NEXT:    andq %r15, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    leaq (%rax,%r8,2), %r8
+; X64-NEXT:    andq %r11, %r8
+; X64-NEXT:    leaq (%r8,%rdi,4), %rdi
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    andq %rbp, %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT:    shrdq $48, %rdi, %r9
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    andq %r13, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    andq %rdi, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    andq %r11, %rax
-; X64-NEXT:    andq %r14, %rcx
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    shrq $4, %rbx
+; X64-NEXT:    andq %r13, %rbx
+; X64-NEXT:    andq %r13, %rcx
+; X64-NEXT:    shlq $4, %rcx
+; X64-NEXT:    orq %rbx, %rcx
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    andq %r11, %rbx
 ; X64-NEXT:    shrq $2, %rcx
-; X64-NEXT:    leaq (%rcx,%rax,4), %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    andq %rbx, %rcx
-; X64-NEXT:    andq %r15, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    leaq (%rax,%rcx,2), %rcx
+; X64-NEXT:    andq %r11, %rcx
+; X64-NEXT:    leaq (%rcx,%rbx,4), %rcx
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rcx
+; X64-NEXT:    andq %rbp, %rcx
+; X64-NEXT:    leaq (%rcx,%rbx,2), %rcx
+; X64-NEXT:    shrdq $48, %rcx, %rdi
 ; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    andq %r13, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    andq %rdi, %rdx
-; X64-NEXT:    shrq $4, %rdx
-; X64-NEXT:    orq %rax, %rdx
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    andq %r11, %rax
-; X64-NEXT:    andq %r14, %rdx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    shrq $4, %rbx
+; X64-NEXT:    andq %r13, %rbx
+; X64-NEXT:    andq %r13, %rdx
+; X64-NEXT:    shlq $4, %rdx
+; X64-NEXT:    orq %rbx, %rdx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    andq %r11, %rbx
 ; X64-NEXT:    shrq $2, %rdx
-; X64-NEXT:    leaq (%rdx,%rax,4), %rax
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    andq %rbx, %rdx
-; X64-NEXT:    andq %r15, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    leaq (%rax,%rdx,2), %rax
-; X64-NEXT:    bswapq %rsi
-; X64-NEXT:    andq %rsi, %r13
-; X64-NEXT:    andq %rdi, %rsi
-; X64-NEXT:    shlq $4, %r13
-; X64-NEXT:    shrq $4, %rsi
-; X64-NEXT:    orq %r13, %rsi
-; X64-NEXT:    andq %rsi, %r11
-; X64-NEXT:    andq %r14, %rsi
-; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    leaq (%rsi,%r11,4), %rdx
-; X64-NEXT:    andq %rdx, %rbx
-; X64-NEXT:    andq %r15, %rdx
+; X64-NEXT:    andq %r11, %rdx
+; X64-NEXT:    leaq (%rdx,%rbx,4), %rdx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    andq %rbp, %rbx
 ; X64-NEXT:    shrq %rdx
+; X64-NEXT:    andq %rbp, %rdx
 ; X64-NEXT:    leaq (%rdx,%rbx,2), %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    shrdq $48, %rdi, %rsi
-; X64-NEXT:    shrdq $48, %rbp, %rdi
-; X64-NEXT:    shrdq $48, %r10, %rbp
-; X64-NEXT:    shrdq $48, %r9, %r10
-; X64-NEXT:    shrdq $48, %r8, %r9
-; X64-NEXT:    shrdq $48, %rcx, %r8
-; X64-NEXT:    shrdq $48, %rax, %rcx
-; X64-NEXT:    shrdq $48, %rdx, %rax
-; X64-NEXT:    movq %rax, 56(%r12)
-; X64-NEXT:    movq %rcx, 48(%r12)
-; X64-NEXT:    movq %r8, 40(%r12)
-; X64-NEXT:    movq %r9, 32(%r12)
-; X64-NEXT:    movq %r10, 24(%r12)
-; X64-NEXT:    movq %rbp, 16(%r12)
-; X64-NEXT:    movq %rdi, 8(%r12)
-; X64-NEXT:    movq %rsi, (%r12)
-; X64-NEXT:    shrq $48, %rdx
-; X64-NEXT:    movw %dx, 64(%r12)
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    shrdq $48, %rdx, %rcx
+; X64-NEXT:    bswapq %rsi
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    shrq $4, %rbx
+; X64-NEXT:    andq %r13, %rbx
+; X64-NEXT:    andq %r13, %rsi
+; X64-NEXT:    shlq $4, %rsi
+; X64-NEXT:    orq %rbx, %rsi
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    andq %r11, %rbx
+; X64-NEXT:    shrq $2, %rsi
+; X64-NEXT:    andq %r11, %rsi
+; X64-NEXT:    leaq (%rsi,%rbx,4), %rsi
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    andq %rbp, %rbx
+; X64-NEXT:    shrq %rsi
+; X64-NEXT:    andq %rbp, %rsi
+; X64-NEXT:    leaq (%rsi,%rbx,2), %rsi
+; X64-NEXT:    shrdq $48, %rsi, %rdx
+; X64-NEXT:    shrq $48, %rsi
+; X64-NEXT:    movq %rdx, 56(%rax)
+; X64-NEXT:    movq %rcx, 48(%rax)
+; X64-NEXT:    movq %rdi, 40(%rax)
+; X64-NEXT:    movq %r9, 32(%rax)
+; X64-NEXT:    movq %r12, 24(%rax)
+; X64-NEXT:    movq %r15, 16(%rax)
+; X64-NEXT:    movq %r14, 8(%rax)
+; X64-NEXT:    movq %r10, (%rax)
+; X64-NEXT:    movw %si, 64(%rax)
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13

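(Illustrative sketch, not part of the patch.) The updated scalar CHECK lines above all share one shape: shift right first, then AND with the low-half splat constant (0x0F0F..., 0x3333..., 0x5555...), so each swap step needs a single immediate instead of a high/low pair. In C++ terms, the sequence being matched is roughly the following; the helper name is hypothetical and only meant to make the pattern readable:

  #include <cstdint>

  // Each swap step reuses one "low" mask: applied after the right shift
  // and before the left shift.
  uint32_t reverse_bits(uint32_t V) {
    V = __builtin_bswap32(V);                                  // byte swap
    V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4);   // swap nibbles
    V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2);   // swap bit pairs
    V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1);   // swap single bits
    return V;
  }

The 32-bit constants here are the same immediates the checks verify: 252645135 = 0x0F0F0F0F, 858993459 = 0x33333333, 1431655765 = 0x55555555.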
diff  --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 8c41f533fd6b2..4a50f7c879adb 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -55,16 +55,18 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
 ; X86-NEXT:    psrlw $4, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X86-NEXT:    pand %xmm0, %xmm1
-; X86-NEXT:    psllw $2, %xmm1
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    psrlw $2, %xmm0
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlw $2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X86-NEXT:    pand %xmm2, %xmm1
+; X86-NEXT:    pand %xmm2, %xmm0
+; X86-NEXT:    psllw $2, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; X86-NEXT:    pand %xmm0, %xmm1
+; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlw $1, %xmm1
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X86-NEXT:    pand %xmm2, %xmm1
+; X86-NEXT:    pand %xmm2, %xmm0
 ; X86-NEXT:    paddb %xmm0, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0

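(Illustrative sketch, not part of the patch.) In the vector CHECK lines above, the same rewrite shows up as a single movdqa'd splat register being reused for both operands of each swap step, rather than one register AND plus one constant-pool pand. A minimal SSE2 sketch of one step (the bit-pair swap), assuming only the standard intrinsics from <emmintrin.h>:

  #include <emmintrin.h>

  // One swap step of a byte-wise bit reverse: a single splat constant M2 is
  // ANDed after the right shift and before the left shift, matching the
  // psrlw/pand/pand/psllw sequence in the checks.
  __m128i swap_bit_pairs(__m128i V) {
    const __m128i M2 = _mm_set1_epi8(0x33);
    __m128i Hi = _mm_and_si128(_mm_srli_epi16(V, 2), M2);  // (V >> 2) & 0x33
    __m128i Lo = _mm_slli_epi16(_mm_and_si128(V, M2), 2);  // (V & 0x33) << 2
    return _mm_or_si128(Hi, Lo);
  }

Masking with the per-byte constant after the right shift and before the left shift keeps the 16-bit shifts from leaking bits across byte boundaries, so no separate high-half mask is needed.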
diff  --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll
index 5bdf7872d61a0..2cbced7053e87 100644
--- a/llvm/test/CodeGen/X86/pr43820.ll
+++ b/llvm/test/CodeGen/X86/pr43820.ll
@@ -10,363 +10,362 @@ define i1000 @square(i1000 %A) nounwind {
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; CHECK-NEXT:    bswapq %rbp
+; CHECK-NEXT:    movq %rbp, %r11
+; CHECK-NEXT:    shrq $4, %r11
+; CHECK-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT:    andq %rsi, %r11
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    shlq $4, %rbp
+; CHECK-NEXT:    orq %r11, %rbp
+; CHECK-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; CHECK-NEXT:    movq %rbp, %r12
+; CHECK-NEXT:    andq %rdi, %r12
+; CHECK-NEXT:    shrq $2, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r12,4), %rbp
+; CHECK-NEXT:    movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000
+; CHECK-NEXT:    movq %rbp, %r13
+; CHECK-NEXT:    andq %r12, %r13
+; CHECK-NEXT:    shrq %rbp
+; CHECK-NEXT:    andq %r12, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r13,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    bswapq %rbx
-; CHECK-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
 ; CHECK-NEXT:    movq %rbx, %rbp
-; CHECK-NEXT:    andq %rdi, %rbp
-; CHECK-NEXT:    shlq $4, %rbp
-; CHECK-NEXT:    movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0
-; CHECK-NEXT:    andq %r11, %rbx
-; CHECK-NEXT:    movq %r11, %rax
-; CHECK-NEXT:    shrq $4, %rbx
+; CHECK-NEXT:    shrq $4, %rbp
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    andq %rsi, %rbx
+; CHECK-NEXT:    shlq $4, %rbx
 ; CHECK-NEXT:    orq %rbp, %rbx
-; CHECK-NEXT:    movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; CHECK-NEXT:    movq %rbx, %r14
-; CHECK-NEXT:    andq %r11, %r14
-; CHECK-NEXT:    movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC
-; CHECK-NEXT:    andq %rbp, %rbx
-; CHECK-NEXT:    movq %rbp, %r15
+; CHECK-NEXT:    movq %rbx, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
 ; CHECK-NEXT:    shrq $2, %rbx
-; CHECK-NEXT:    leaq (%rbx,%r14,4), %r14
-; CHECK-NEXT:    movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000
-; CHECK-NEXT:    andq %r14, %rbx
-; CHECK-NEXT:    movabsq $-6148914691247702016, %rbp # imm = 0xAAAAAAAAAA000000
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%rbx,2), %rbx
-; CHECK-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    bswapq %r10
-; CHECK-NEXT:    movq %r10, %rbx
 ; CHECK-NEXT:    andq %rdi, %rbx
-; CHECK-NEXT:    shlq $4, %rbx
-; CHECK-NEXT:    andq %rax, %r10
-; CHECK-NEXT:    shrq $4, %r10
-; CHECK-NEXT:    orq %rbx, %r10
-; CHECK-NEXT:    movq %r10, %rbx
-; CHECK-NEXT:    andq %r11, %rbx
-; CHECK-NEXT:    andq %r15, %r10
-; CHECK-NEXT:    shrq $2, %r10
-; CHECK-NEXT:    leaq (%r10,%rbx,4), %rbp
+; CHECK-NEXT:    leaq (%rbx,%rbp,4), %rbp
 ; CHECK-NEXT:    movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    movabsq $-6148914691236517206, %r13 # imm = 0xAAAAAAAAAAAAAAAA
-; CHECK-NEXT:    andq %r13, %rbp
+; CHECK-NEXT:    movq %rbp, %r12
+; CHECK-NEXT:    andq %rbx, %r12
 ; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
-; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %rax, %rbp
+; CHECK-NEXT:    andq %rbx, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r12,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    bswapq %r15
+; CHECK-NEXT:    movq %r15, %rbp
 ; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    andq %rsi, %r15
+; CHECK-NEXT:    shlq $4, %r15
+; CHECK-NEXT:    orq %rbp, %r15
+; CHECK-NEXT:    movq %r15, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
+; CHECK-NEXT:    shrq $2, %r15
+; CHECK-NEXT:    andq %rdi, %r15
+; CHECK-NEXT:    leaq (%r15,%rbp,4), %rbp
+; CHECK-NEXT:    movq %rbp, %r15
+; CHECK-NEXT:    andq %rbx, %r15
 ; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
-; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %rax, %rbp
+; CHECK-NEXT:    andq %rbx, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r15,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    bswapq %r14
+; CHECK-NEXT:    movq %r14, %rbp
 ; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    andq %rsi, %r14
+; CHECK-NEXT:    shlq $4, %r14
+; CHECK-NEXT:    orq %rbp, %r14
+; CHECK-NEXT:    movq %r14, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
+; CHECK-NEXT:    shrq $2, %r14
+; CHECK-NEXT:    andq %rdi, %r14
+; CHECK-NEXT:    leaq (%r14,%rbp,4), %rbp
+; CHECK-NEXT:    movq %rbp, %r14
+; CHECK-NEXT:    andq %rbx, %r14
 ; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
-; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %rax, %rbp
+; CHECK-NEXT:    andq %rbx, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r14,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %rbp
 ; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    andq %rsi, %r10
 ; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %rax, %rbp
-; CHECK-NEXT:    movq %rax, %r14
-; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
+; CHECK-NEXT:    orq %rbp, %r10
+; CHECK-NEXT:    movq %r10, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
+; CHECK-NEXT:    shrq $2, %r10
+; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    leaq (%r10,%rbp,4), %rbp
 ; CHECK-NEXT:    movq %rbp, %r10
 ; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
 ; CHECK-NEXT:    shrq %rbp
+; CHECK-NEXT:    andq %rbx, %rbp
 ; CHECK-NEXT:    leaq (%rbp,%r10,2), %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
 ; CHECK-NEXT:    bswapq %rbp
 ; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
-; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq $4, %rbp
+; CHECK-NEXT:    shrq $4, %r10
+; CHECK-NEXT:    andq %rsi, %r10
+; CHECK-NEXT:    andq %rsi, %rbp
+; CHECK-NEXT:    shlq $4, %rbp
 ; CHECK-NEXT:    orq %r10, %rbp
 ; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
 ; CHECK-NEXT:    andq %rdi, %r10
-; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
 ; CHECK-NEXT:    shrq $2, %rbp
+; CHECK-NEXT:    andq %rdi, %rbp
 ; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
 ; CHECK-NEXT:    movq %rbp, %r10
 ; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
 ; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    andq %rbx, %rbp
+; CHECK-NEXT:    leaq (%rbp,%r10,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    shrq $4, %r14
+; CHECK-NEXT:    andq %rsi, %r14
+; CHECK-NEXT:    andq %rsi, %r10
 ; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
+; CHECK-NEXT:    orq %r14, %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rdi, %r14
+; CHECK-NEXT:    shrq $2, %r10
+; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    movq %rdi, %rbp
+; CHECK-NEXT:    leaq (%r10,%r14,4), %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rbx, %r14
+; CHECK-NEXT:    shrq %r10
 ; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    shrq $4, %r14
+; CHECK-NEXT:    andq %rsi, %r14
+; CHECK-NEXT:    andq %rsi, %r10
+; CHECK-NEXT:    shlq $4, %r10
+; CHECK-NEXT:    orq %r14, %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rdi, %r14
+; CHECK-NEXT:    shrq $2, %r10
 ; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,4), %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rbx, %r14
+; CHECK-NEXT:    shrq %r10
+; CHECK-NEXT:    andq %rbx, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    shrq $4, %r14
+; CHECK-NEXT:    andq %rsi, %r14
+; CHECK-NEXT:    andq %rsi, %r10
 ; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
+; CHECK-NEXT:    orq %r14, %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rdi, %r14
+; CHECK-NEXT:    shrq $2, %r10
+; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,4), %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rbx, %r14
+; CHECK-NEXT:    shrq %r10
 ; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT:    bswapq %rbp
-; CHECK-NEXT:    movq %rbp, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    shrq $4, %r14
+; CHECK-NEXT:    andq %rsi, %r14
+; CHECK-NEXT:    andq %rsi, %r10
+; CHECK-NEXT:    shlq $4, %r10
+; CHECK-NEXT:    orq %r14, %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rdi, %r14
+; CHECK-NEXT:    shrq $2, %r10
 ; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,4), %r10
+; CHECK-NEXT:    movq %r10, %r14
+; CHECK-NEXT:    andq %rbx, %r14
+; CHECK-NEXT:    shrq %r10
+; CHECK-NEXT:    andq %rbx, %r10
+; CHECK-NEXT:    leaq (%r10,%r14,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    bswapq %r10
+; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    andq %rsi, %r10
 ; CHECK-NEXT:    shlq $4, %r10
-; CHECK-NEXT:    andq %r14, %rbp
-; CHECK-NEXT:    shrq $4, %rbp
-; CHECK-NEXT:    orq %r10, %rbp
-; CHECK-NEXT:    movq %rbp, %r10
-; CHECK-NEXT:    andq %r11, %r10
-; CHECK-NEXT:    andq %r15, %rbp
-; CHECK-NEXT:    shrq $2, %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r10
+; CHECK-NEXT:    orq %rax, %r10
+; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    shrq $2, %r10
+; CHECK-NEXT:    andq %rdi, %r10
+; CHECK-NEXT:    leaq (%r10,%rax,4), %rax
+; CHECK-NEXT:    movq %rax, %r10
 ; CHECK-NEXT:    andq %rbx, %r10
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%r10,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    bswapq %r9
-; CHECK-NEXT:    movq %r9, %rbp
-; CHECK-NEXT:    andq %rdi, %rbp
-; CHECK-NEXT:    shlq $4, %rbp
-; CHECK-NEXT:    andq %r14, %r9
-; CHECK-NEXT:    shrq $4, %r9
-; CHECK-NEXT:    orq %rbp, %r9
-; CHECK-NEXT:    movq %r9, %rbp
-; CHECK-NEXT:    andq %r11, %rbp
-; CHECK-NEXT:    andq %r15, %r9
+; CHECK-NEXT:    movq %r9, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    andq %rsi, %r9
+; CHECK-NEXT:    shlq $4, %r9
+; CHECK-NEXT:    orq %rax, %r9
+; CHECK-NEXT:    movq %r9, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    shrq $2, %r9
-; CHECK-NEXT:    leaq (%r9,%rbp,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r9
+; CHECK-NEXT:    andq %rdi, %r9
+; CHECK-NEXT:    leaq (%r9,%rax,4), %rax
+; CHECK-NEXT:    movq %rax, %r9
 ; CHECK-NEXT:    andq %rbx, %r9
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r9,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%r9,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    bswapq %r8
-; CHECK-NEXT:    movq %r8, %rbp
-; CHECK-NEXT:    andq %rdi, %rbp
-; CHECK-NEXT:    shlq $4, %rbp
-; CHECK-NEXT:    andq %r14, %r8
-; CHECK-NEXT:    shrq $4, %r8
-; CHECK-NEXT:    orq %rbp, %r8
-; CHECK-NEXT:    movq %r8, %rbp
-; CHECK-NEXT:    andq %r11, %rbp
-; CHECK-NEXT:    andq %r15, %r8
-; CHECK-NEXT:    movq %r15, %r9
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    andq %rsi, %r8
+; CHECK-NEXT:    shlq $4, %r8
+; CHECK-NEXT:    orq %rax, %r8
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    shrq $2, %r8
-; CHECK-NEXT:    leaq (%r8,%rbp,4), %rbp
-; CHECK-NEXT:    movq %rbp, %r8
+; CHECK-NEXT:    andq %rdi, %r8
+; CHECK-NEXT:    leaq (%r8,%rax,4), %rax
+; CHECK-NEXT:    movq %rax, %r8
 ; CHECK-NEXT:    andq %rbx, %r8
-; CHECK-NEXT:    andq %r13, %rbp
-; CHECK-NEXT:    shrq %rbp
-; CHECK-NEXT:    leaq (%rbp,%r8,2), %rbp
-; CHECK-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%r8,2), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    bswapq %rcx
-; CHECK-NEXT:    movq %rcx, %rbp
-; CHECK-NEXT:    andq %rdi, %rbp
-; CHECK-NEXT:    shlq $4, %rbp
-; CHECK-NEXT:    andq %r14, %rcx
-; CHECK-NEXT:    shrq $4, %rcx
-; CHECK-NEXT:    orq %rbp, %rcx
-; CHECK-NEXT:    movq %rcx, %rbp
-; CHECK-NEXT:    andq %r11, %rbp
-; CHECK-NEXT:    andq %r15, %rcx
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    shlq $4, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    shrq $2, %rcx
-; CHECK-NEXT:    leaq (%rcx,%rbp,4), %rcx
-; CHECK-NEXT:    movq %rcx, %rbp
-; CHECK-NEXT:    andq %rbx, %rbp
-; CHECK-NEXT:    andq %r13, %rcx
-; CHECK-NEXT:    shrq %rcx
-; CHECK-NEXT:    leaq (%rcx,%rbp,2), %r15
+; CHECK-NEXT:    andq %rdi, %rcx
+; CHECK-NEXT:    leaq (%rcx,%rax,4), %rax
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    andq %rbx, %rcx
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%rcx,2), %r12
 ; CHECK-NEXT:    bswapq %rdx
-; CHECK-NEXT:    movq %rdx, %rbp
-; CHECK-NEXT:    andq %rdi, %rbp
-; CHECK-NEXT:    shlq $4, %rbp
-; CHECK-NEXT:    andq %r14, %rdx
-; CHECK-NEXT:    shrq $4, %rdx
-; CHECK-NEXT:    orq %rbp, %rdx
-; CHECK-NEXT:    movq %rdx, %rbp
-; CHECK-NEXT:    andq %r11, %rbp
-; CHECK-NEXT:    andq %r9, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    andq %rsi, %rdx
+; CHECK-NEXT:    shlq $4, %rdx
+; CHECK-NEXT:    orq %rax, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    shrq $2, %rdx
-; CHECK-NEXT:    leaq (%rdx,%rbp,4), %rdx
-; CHECK-NEXT:    movq %rdx, %rbp
-; CHECK-NEXT:    andq %rbx, %rbp
-; CHECK-NEXT:    andq %r13, %rdx
-; CHECK-NEXT:    shrq %rdx
-; CHECK-NEXT:    leaq (%rdx,%rbp,2), %rdx
-; CHECK-NEXT:    bswapq %rsi
-; CHECK-NEXT:    andq %rsi, %rdi
-; CHECK-NEXT:    andq %r14, %rsi
-; CHECK-NEXT:    shlq $4, %rdi
-; CHECK-NEXT:    shrq $4, %rsi
-; CHECK-NEXT:    orq %rdi, %rsi
-; CHECK-NEXT:    andq %rsi, %r11
-; CHECK-NEXT:    andq %r9, %rsi
-; CHECK-NEXT:    shrq $2, %rsi
-; CHECK-NEXT:    leaq (%rsi,%r11,4), %rsi
-; CHECK-NEXT:    andq %rsi, %rbx
-; CHECK-NEXT:    andq %r13, %rsi
-; CHECK-NEXT:    shrq %rsi
-; CHECK-NEXT:    leaq (%rsi,%rbx,2), %r13
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    leaq (%rdx,%rax,4), %rax
+; CHECK-NEXT:    movq %rax, %rdx
+; CHECK-NEXT:    andq %rbx, %rdx
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%rdx,2), %rdi
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %rax, %r11
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    shrq $4, %rcx
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    shlq $4, %rax
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    andq %rbp, %rcx
+; CHECK-NEXT:    shrq $2, %rax
+; CHECK-NEXT:    andq %rbp, %rax
+; CHECK-NEXT:    leaq (%rax,%rcx,4), %rax
+; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    andq %rbx, %rsi
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rbx, %rax
+; CHECK-NEXT:    leaq (%rax,%rsi,2), %rsi
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    shrdq $24, %rax, %rdx
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-NEXT:    shrdq $24, %rcx, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; CHECK-NEXT:    shrdq $24, %rbp, %rcx
 ; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %r12, %rbp
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; CHECK-NEXT:    shrdq $24, %r13, %rbp
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; CHECK-NEXT:    shrdq $24, %r15, %r13
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %r14, %r12
+; CHECK-NEXT:    shrdq $24, %r14, %r15
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; CHECK-NEXT:    shrdq $24, %rbx, %r14
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; CHECK-NEXT:    shrdq $24, %r11, %rbx
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %r10, %rbx
+; CHECK-NEXT:    shrdq $24, %r10, %r11
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; CHECK-NEXT:    shrdq $24, %r9, %r10
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; CHECK-NEXT:    shrdq $24, %r8, %r9
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %rdi, %r8
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %rsi, %rdi
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT:    shrdq $24, %rax, %rsi
-; CHECK-NEXT:    shrdq $24, %r15, %rax
+; CHECK-NEXT:    shrdq $24, %rax, %r8
+; CHECK-NEXT:    shrdq $24, %r12, %rax
 ; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    shrdq $24, %rdx, %r15
-; CHECK-NEXT:    shrdq $24, %r13, %rdx
+; CHECK-NEXT:    shrdq $24, %rdi, %r12
+; CHECK-NEXT:    shrdq $24, %rsi, %rdi
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT:    movq %rdx, 112(%rax)
-; CHECK-NEXT:    movq %r15, 104(%rax)
+; CHECK-NEXT:    movq %rdi, 112(%rax)
+; CHECK-NEXT:    movq %r12, 104(%rax)
 ; CHECK-NEXT:    movq %rcx, 96(%rax)
-; CHECK-NEXT:    movq %rsi, 88(%rax)
-; CHECK-NEXT:    movq %rdi, 80(%rax)
-; CHECK-NEXT:    movq %r8, 72(%rax)
-; CHECK-NEXT:    movq %r9, 64(%rax)
-; CHECK-NEXT:    movq %r10, 56(%rax)
-; CHECK-NEXT:    movq %rbx, 48(%rax)
-; CHECK-NEXT:    movq %r14, 40(%rax)
-; CHECK-NEXT:    movq %r12, 32(%rax)
+; CHECK-NEXT:    movq %r8, 88(%rax)
+; CHECK-NEXT:    movq %r9, 80(%rax)
+; CHECK-NEXT:    movq %r10, 72(%rax)
+; CHECK-NEXT:    movq %r11, 64(%rax)
+; CHECK-NEXT:    movq %rbx, 56(%rax)
+; CHECK-NEXT:    movq %r14, 48(%rax)
+; CHECK-NEXT:    movq %r15, 40(%rax)
+; CHECK-NEXT:    movq %r13, 32(%rax)
 ; CHECK-NEXT:    movq %rbp, 24(%rax)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-NEXT:    movq %rcx, 16(%rax)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-NEXT:    movq %rcx, 8(%rax)
-; CHECK-NEXT:    movq %r11, (%rax)
-; CHECK-NEXT:    movq %r13, %rcx
-; CHECK-NEXT:    shrq $56, %r13
-; CHECK-NEXT:    movb %r13b, 124(%rax)
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %rsi, %rcx
+; CHECK-NEXT:    shrq $56, %rsi
+; CHECK-NEXT:    movb %sil, 124(%rax)
 ; CHECK-NEXT:    shrq $24, %rcx
 ; CHECK-NEXT:    movl %ecx, 120(%rax)
 ; CHECK-NEXT:    popq %rbx

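(Aside for readers skimming the updated checks above: the scalar pattern the new CHECK lines follow is easier to see outside of a diff. The sketch below is illustrative only and is not part of the patch; the function name and the use of C++ with __builtin_bswap64 are my own choices. The three splat constants and the shift-before-mask ordering, however, mirror what the expansion now emits, which is why each mask value is materialized only once per step.)

    #include <cstdint>

    // Sketch of the new i64 expansion shape: shift first, then reuse the
    // single "low half" mask on both operands of each swap step.
    uint64_t bitreverse64_sketch(uint64_t v) {
      v = __builtin_bswap64(v);                       // reverse bytes
      v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) |        // swap nibbles
          ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
      v = ((v >> 2) & 0x3333333333333333ULL) |        // swap bit pairs
          ((v & 0x3333333333333333ULL) << 2);
      v = ((v >> 1) & 0x5555555555555555ULL) |        // swap adjacent bits
          ((v & 0x5555555555555555ULL) << 1);
      return v;
    }
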
diff  --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 651418d271be5..3555312b18a1a 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -24,14 +24,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andb $51, %al
 ; SSE-NEXT:    shlb $2, %al
-; SSE-NEXT:    andb $-52, %dil
 ; SSE-NEXT:    shrb $2, %dil
+; SSE-NEXT:    andb $51, %dil
 ; SSE-NEXT:    orb %al, %dil
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andb $85, %al
 ; SSE-NEXT:    addb %al, %al
-; SSE-NEXT:    andb $-86, %dil
 ; SSE-NEXT:    shrb %dil
+; SSE-NEXT:    andb $85, %dil
 ; SSE-NEXT:    addl %edi, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
@@ -43,14 +43,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andb $51, %al
 ; AVX-NEXT:    shlb $2, %al
-; AVX-NEXT:    andb $-52, %dil
 ; AVX-NEXT:    shrb $2, %dil
+; AVX-NEXT:    andb $51, %dil
 ; AVX-NEXT:    orb %al, %dil
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andb $85, %al
 ; AVX-NEXT:    addb %al, %al
-; AVX-NEXT:    andb $-86, %dil
 ; AVX-NEXT:    shrb %dil
+; AVX-NEXT:    andb $85, %dil
 ; AVX-NEXT:    addl %edi, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
@@ -70,14 +70,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andb $51, %al
 ; GFNISSE-NEXT:    shlb $2, %al
-; GFNISSE-NEXT:    andb $-52, %dil
 ; GFNISSE-NEXT:    shrb $2, %dil
+; GFNISSE-NEXT:    andb $51, %dil
 ; GFNISSE-NEXT:    orb %al, %dil
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andb $85, %al
 ; GFNISSE-NEXT:    addb %al, %al
-; GFNISSE-NEXT:    andb $-86, %dil
 ; GFNISSE-NEXT:    shrb %dil
+; GFNISSE-NEXT:    andb $85, %dil
 ; GFNISSE-NEXT:    addl %edi, %eax
 ; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
 ; GFNISSE-NEXT:    retq
@@ -89,14 +89,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andb $51, %al
 ; GFNIAVX-NEXT:    shlb $2, %al
-; GFNIAVX-NEXT:    andb $-52, %dil
 ; GFNIAVX-NEXT:    shrb $2, %dil
+; GFNIAVX-NEXT:    andb $51, %dil
 ; GFNIAVX-NEXT:    orb %al, %dil
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andb $85, %al
 ; GFNIAVX-NEXT:    addb %al, %al
-; GFNIAVX-NEXT:    andb $-86, %dil
 ; GFNIAVX-NEXT:    shrb %dil
+; GFNIAVX-NEXT:    andb $85, %dil
 ; GFNIAVX-NEXT:    addl %edi, %eax
 ; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
 ; GFNIAVX-NEXT:    retq
@@ -108,14 +108,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andb $51, %al
 ; GFNIAVX2-NEXT:    shlb $2, %al
-; GFNIAVX2-NEXT:    andb $-52, %dil
 ; GFNIAVX2-NEXT:    shrb $2, %dil
+; GFNIAVX2-NEXT:    andb $51, %dil
 ; GFNIAVX2-NEXT:    orb %al, %dil
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andb $85, %al
 ; GFNIAVX2-NEXT:    addb %al, %al
-; GFNIAVX2-NEXT:    andb $-86, %dil
 ; GFNIAVX2-NEXT:    shrb %dil
+; GFNIAVX2-NEXT:    andb $85, %dil
 ; GFNIAVX2-NEXT:    addl %edi, %eax
 ; GFNIAVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; GFNIAVX2-NEXT:    retq
@@ -127,14 +127,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andb $51, %al
 ; GFNIAVX512F-NEXT:    shlb $2, %al
-; GFNIAVX512F-NEXT:    andb $-52, %dil
 ; GFNIAVX512F-NEXT:    shrb $2, %dil
+; GFNIAVX512F-NEXT:    andb $51, %dil
 ; GFNIAVX512F-NEXT:    orb %al, %dil
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andb $85, %al
 ; GFNIAVX512F-NEXT:    addb %al, %al
-; GFNIAVX512F-NEXT:    andb $-86, %dil
 ; GFNIAVX512F-NEXT:    shrb %dil
+; GFNIAVX512F-NEXT:    andb $85, %dil
 ; GFNIAVX512F-NEXT:    addl %edi, %eax
 ; GFNIAVX512F-NEXT:    # kill: def $al killed $al killed $eax
 ; GFNIAVX512F-NEXT:    retq
@@ -146,14 +146,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andb $51, %al
 ; GFNIAVX512BW-NEXT:    shlb $2, %al
-; GFNIAVX512BW-NEXT:    andb $-52, %dil
 ; GFNIAVX512BW-NEXT:    shrb $2, %dil
+; GFNIAVX512BW-NEXT:    andb $51, %dil
 ; GFNIAVX512BW-NEXT:    orb %al, %dil
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andb $85, %al
 ; GFNIAVX512BW-NEXT:    addb %al, %al
-; GFNIAVX512BW-NEXT:    andb $-86, %dil
 ; GFNIAVX512BW-NEXT:    shrb %dil
+; GFNIAVX512BW-NEXT:    andb $85, %dil
 ; GFNIAVX512BW-NEXT:    addl %edi, %eax
 ; GFNIAVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; GFNIAVX512BW-NEXT:    retq
@@ -169,18 +169,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; SSE-NEXT:    shll $4, %eax
-; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; SSE-NEXT:    shrl $4, %edi
+; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; SSE-NEXT:    orl %eax, %edi
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
-; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; SSE-NEXT:    shrl $2, %edi
+; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
 ; SSE-NEXT:    leal (%rdi,%rax,4), %eax
 ; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
-; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; SSE-NEXT:    shrl %eax
+; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
 ; SSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
@@ -192,18 +192,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; AVX-NEXT:    shll $4, %eax
-; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; AVX-NEXT:    shrl $4, %edi
+; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; AVX-NEXT:    orl %eax, %edi
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
-; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; AVX-NEXT:    shrl $2, %edi
+; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
 ; AVX-NEXT:    leal (%rdi,%rax,4), %eax
 ; AVX-NEXT:    movl %eax, %ecx
 ; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
-; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; AVX-NEXT:    shrl %eax
+; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
 ; AVX-NEXT:    leal (%rax,%rcx,2), %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
@@ -223,18 +223,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; GFNISSE-NEXT:    shll $4, %eax
-; GFNISSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; GFNISSE-NEXT:    shrl $4, %edi
+; GFNISSE-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; GFNISSE-NEXT:    orl %eax, %edi
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
-; GFNISSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; GFNISSE-NEXT:    shrl $2, %edi
+; GFNISSE-NEXT:    andl $13107, %edi # imm = 0x3333
 ; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNISSE-NEXT:    movl %eax, %ecx
 ; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
-; GFNISSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; GFNISSE-NEXT:    shrl %eax
+; GFNISSE-NEXT:    andl $21845, %eax # imm = 0x5555
 ; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GFNISSE-NEXT:    retq
@@ -246,18 +246,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; GFNIAVX-NEXT:    shll $4, %eax
-; GFNIAVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; GFNIAVX-NEXT:    shrl $4, %edi
+; GFNIAVX-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; GFNIAVX-NEXT:    orl %eax, %edi
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
-; GFNIAVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; GFNIAVX-NEXT:    shrl $2, %edi
+; GFNIAVX-NEXT:    andl $13107, %edi # imm = 0x3333
 ; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX-NEXT:    movl %eax, %ecx
 ; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
-; GFNIAVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; GFNIAVX-NEXT:    shrl %eax
+; GFNIAVX-NEXT:    andl $21845, %eax # imm = 0x5555
 ; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GFNIAVX-NEXT:    retq
@@ -269,18 +269,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; GFNIAVX2-NEXT:    shll $4, %eax
-; GFNIAVX2-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; GFNIAVX2-NEXT:    shrl $4, %edi
+; GFNIAVX2-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; GFNIAVX2-NEXT:    orl %eax, %edi
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
-; GFNIAVX2-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; GFNIAVX2-NEXT:    shrl $2, %edi
+; GFNIAVX2-NEXT:    andl $13107, %edi # imm = 0x3333
 ; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX2-NEXT:    movl %eax, %ecx
 ; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
-; GFNIAVX2-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; GFNIAVX2-NEXT:    shrl %eax
+; GFNIAVX2-NEXT:    andl $21845, %eax # imm = 0x5555
 ; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GFNIAVX2-NEXT:    retq
@@ -292,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; GFNIAVX512F-NEXT:    shll $4, %eax
-; GFNIAVX512F-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; GFNIAVX512F-NEXT:    shrl $4, %edi
+; GFNIAVX512F-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; GFNIAVX512F-NEXT:    orl %eax, %edi
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
-; GFNIAVX512F-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; GFNIAVX512F-NEXT:    shrl $2, %edi
+; GFNIAVX512F-NEXT:    andl $13107, %edi # imm = 0x3333
 ; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX512F-NEXT:    movl %eax, %ecx
 ; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
-; GFNIAVX512F-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; GFNIAVX512F-NEXT:    shrl %eax
+; GFNIAVX512F-NEXT:    andl $21845, %eax # imm = 0x5555
 ; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GFNIAVX512F-NEXT:    retq
@@ -315,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
 ; GFNIAVX512BW-NEXT:    shll $4, %eax
-; GFNIAVX512BW-NEXT:    andl $61680, %edi # imm = 0xF0F0
 ; GFNIAVX512BW-NEXT:    shrl $4, %edi
+; GFNIAVX512BW-NEXT:    andl $3855, %edi # imm = 0xF0F
 ; GFNIAVX512BW-NEXT:    orl %eax, %edi
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
-; GFNIAVX512BW-NEXT:    andl $52428, %edi # imm = 0xCCCC
 ; GFNIAVX512BW-NEXT:    shrl $2, %edi
+; GFNIAVX512BW-NEXT:    andl $13107, %edi # imm = 0x3333
 ; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX512BW-NEXT:    movl %eax, %ecx
 ; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
-; GFNIAVX512BW-NEXT:    andl $43690, %eax # imm = 0xAAAA
 ; GFNIAVX512BW-NEXT:    shrl %eax
+; GFNIAVX512BW-NEXT:    andl $21845, %eax # imm = 0x5555
 ; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GFNIAVX512BW-NEXT:    retq
@@ -342,18 +342,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; SSE-NEXT:    shll $4, %eax
-; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; SSE-NEXT:    shrl $4, %edi
+; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; SSE-NEXT:    orl %eax, %edi
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; SSE-NEXT:    shrl $2, %edi
+; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; SSE-NEXT:    leal (%rdi,%rax,4), %eax
 ; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; SSE-NEXT:    shrl %eax
+; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; SSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; SSE-NEXT:    retq
 ;
@@ -364,18 +364,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; AVX-NEXT:    shll $4, %eax
-; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; AVX-NEXT:    shrl $4, %edi
+; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; AVX-NEXT:    orl %eax, %edi
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; AVX-NEXT:    shrl $2, %edi
+; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; AVX-NEXT:    leal (%rdi,%rax,4), %eax
 ; AVX-NEXT:    movl %eax, %ecx
 ; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX-NEXT:    shrl %eax
+; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; AVX-NEXT:    leal (%rax,%rcx,2), %eax
 ; AVX-NEXT:    retq
 ;
@@ -393,18 +393,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; GFNISSE-NEXT:    shll $4, %eax
-; GFNISSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; GFNISSE-NEXT:    shrl $4, %edi
+; GFNISSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; GFNISSE-NEXT:    orl %eax, %edi
 ; GFNISSE-NEXT:    movl %edi, %eax
 ; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNISSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; GFNISSE-NEXT:    shrl $2, %edi
+; GFNISSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNISSE-NEXT:    movl %eax, %ecx
 ; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNISSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; GFNISSE-NEXT:    shrl %eax
+; GFNISSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNISSE-NEXT:    retq
 ;
@@ -415,18 +415,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; GFNIAVX-NEXT:    shll $4, %eax
-; GFNIAVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; GFNIAVX-NEXT:    shrl $4, %edi
+; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; GFNIAVX-NEXT:    orl %eax, %edi
 ; GFNIAVX-NEXT:    movl %edi, %eax
 ; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; GFNIAVX-NEXT:    shrl $2, %edi
+; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX-NEXT:    movl %eax, %ecx
 ; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; GFNIAVX-NEXT:    shrl %eax
+; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX-NEXT:    retq
 ;
@@ -437,18 +437,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; GFNIAVX2-NEXT:    shll $4, %eax
-; GFNIAVX2-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; GFNIAVX2-NEXT:    shrl $4, %edi
+; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; GFNIAVX2-NEXT:    orl %eax, %edi
 ; GFNIAVX2-NEXT:    movl %edi, %eax
 ; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX2-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; GFNIAVX2-NEXT:    shrl $2, %edi
+; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX2-NEXT:    movl %eax, %ecx
 ; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX2-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; GFNIAVX2-NEXT:    shrl %eax
+; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -459,18 +459,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; GFNIAVX512F-NEXT:    shll $4, %eax
-; GFNIAVX512F-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; GFNIAVX512F-NEXT:    shrl $4, %edi
+; GFNIAVX512F-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; GFNIAVX512F-NEXT:    orl %eax, %edi
 ; GFNIAVX512F-NEXT:    movl %edi, %eax
 ; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX512F-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; GFNIAVX512F-NEXT:    shrl $2, %edi
+; GFNIAVX512F-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX512F-NEXT:    movl %eax, %ecx
 ; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX512F-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; GFNIAVX512F-NEXT:    shrl %eax
+; GFNIAVX512F-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX512F-NEXT:    retq
 ;
@@ -481,18 +481,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
 ; GFNIAVX512BW-NEXT:    shll $4, %eax
-; GFNIAVX512BW-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
 ; GFNIAVX512BW-NEXT:    shrl $4, %edi
+; GFNIAVX512BW-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; GFNIAVX512BW-NEXT:    orl %eax, %edi
 ; GFNIAVX512BW-NEXT:    movl %edi, %eax
 ; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX512BW-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
 ; GFNIAVX512BW-NEXT:    shrl $2, %edi
+; GFNIAVX512BW-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
 ; GFNIAVX512BW-NEXT:    movl %eax, %ecx
 ; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX512BW-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; GFNIAVX512BW-NEXT:    shrl %eax
+; GFNIAVX512BW-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
 ; GFNIAVX512BW-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a)
@@ -503,49 +503,49 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; SSE-LABEL: test_bitreverse_i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    bswapq %rdi
-; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; SSE-NEXT:    andq %rdi, %rax
-; SSE-NEXT:    shlq $4, %rax
-; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; SSE-NEXT:    andq %rdi, %rcx
-; SSE-NEXT:    shrq $4, %rcx
-; SSE-NEXT:    orq %rax, %rcx
-; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    shrq $4, %rax
+; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; SSE-NEXT:    andq %rcx, %rax
-; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; SSE-NEXT:    andq %rcx, %rdx
-; SSE-NEXT:    shrq $2, %rdx
-; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
-; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; SSE-NEXT:    andq %rcx, %rdi
+; SSE-NEXT:    shlq $4, %rdi
+; SSE-NEXT:    orq %rax, %rdi
+; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rdi, %rcx
 ; SSE-NEXT:    andq %rax, %rcx
-; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; SSE-NEXT:    andq %rax, %rdx
-; SSE-NEXT:    shrq %rdx
-; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
+; SSE-NEXT:    shrq $2, %rdi
+; SSE-NEXT:    andq %rax, %rdi
+; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
+; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    shrq %rax
+; SSE-NEXT:    andq %rcx, %rax
+; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_bitreverse_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    bswapq %rdi
-; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; AVX-NEXT:    andq %rdi, %rax
-; AVX-NEXT:    shlq $4, %rax
-; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; AVX-NEXT:    andq %rdi, %rcx
-; AVX-NEXT:    shrq $4, %rcx
-; AVX-NEXT:    orq %rax, %rcx
-; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    shrq $4, %rax
+; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; AVX-NEXT:    andq %rcx, %rax
-; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; AVX-NEXT:    andq %rcx, %rdx
-; AVX-NEXT:    shrq $2, %rdx
-; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
-; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX-NEXT:    andq %rcx, %rdi
+; AVX-NEXT:    shlq $4, %rdi
+; AVX-NEXT:    orq %rax, %rdi
+; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX-NEXT:    movq %rdi, %rcx
 ; AVX-NEXT:    andq %rax, %rcx
-; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; AVX-NEXT:    andq %rax, %rdx
-; AVX-NEXT:    shrq %rdx
-; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
+; AVX-NEXT:    shrq $2, %rdi
+; AVX-NEXT:    andq %rax, %rdi
+; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
+; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX-NEXT:    movq %rax, %rdx
+; AVX-NEXT:    andq %rcx, %rdx
+; AVX-NEXT:    shrq %rax
+; AVX-NEXT:    andq %rcx, %rax
+; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i64:
@@ -558,121 +558,121 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; GFNISSE-LABEL: test_bitreverse_i64:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    bswapq %rdi
-; GFNISSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNISSE-NEXT:    andq %rdi, %rax
-; GFNISSE-NEXT:    shlq $4, %rax
-; GFNISSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNISSE-NEXT:    andq %rdi, %rcx
-; GFNISSE-NEXT:    shrq $4, %rcx
-; GFNISSE-NEXT:    orq %rax, %rcx
-; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNISSE-NEXT:    movq %rdi, %rax
+; GFNISSE-NEXT:    shrq $4, %rax
+; GFNISSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; GFNISSE-NEXT:    andq %rcx, %rax
-; GFNISSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNISSE-NEXT:    andq %rcx, %rdx
-; GFNISSE-NEXT:    shrq $2, %rdx
-; GFNISSE-NEXT:    leaq (%rdx,%rax,4), %rax
-; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNISSE-NEXT:    andq %rcx, %rdi
+; GFNISSE-NEXT:    shlq $4, %rdi
+; GFNISSE-NEXT:    orq %rax, %rdi
+; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNISSE-NEXT:    movq %rdi, %rcx
 ; GFNISSE-NEXT:    andq %rax, %rcx
-; GFNISSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNISSE-NEXT:    andq %rax, %rdx
-; GFNISSE-NEXT:    shrq %rdx
-; GFNISSE-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNISSE-NEXT:    shrq $2, %rdi
+; GFNISSE-NEXT:    andq %rax, %rdi
+; GFNISSE-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNISSE-NEXT:    movq %rax, %rdx
+; GFNISSE-NEXT:    andq %rcx, %rdx
+; GFNISSE-NEXT:    shrq %rax
+; GFNISSE-NEXT:    andq %rcx, %rax
+; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX-LABEL: test_bitreverse_i64:
 ; GFNIAVX:       # %bb.0:
 ; GFNIAVX-NEXT:    bswapq %rdi
-; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX-NEXT:    andq %rdi, %rax
-; GFNIAVX-NEXT:    shlq $4, %rax
-; GFNIAVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX-NEXT:    andq %rdi, %rcx
-; GFNIAVX-NEXT:    shrq $4, %rcx
-; GFNIAVX-NEXT:    orq %rax, %rcx
-; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX-NEXT:    movq %rdi, %rax
+; GFNIAVX-NEXT:    shrq $4, %rax
+; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; GFNIAVX-NEXT:    andq %rcx, %rax
-; GFNIAVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX-NEXT:    andq %rcx, %rdx
-; GFNIAVX-NEXT:    shrq $2, %rdx
-; GFNIAVX-NEXT:    leaq (%rdx,%rax,4), %rax
-; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX-NEXT:    andq %rcx, %rdi
+; GFNIAVX-NEXT:    shlq $4, %rdi
+; GFNIAVX-NEXT:    orq %rax, %rdi
+; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX-NEXT:    movq %rdi, %rcx
 ; GFNIAVX-NEXT:    andq %rax, %rcx
-; GFNIAVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX-NEXT:    andq %rax, %rdx
-; GFNIAVX-NEXT:    shrq %rdx
-; GFNIAVX-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX-NEXT:    shrq $2, %rdi
+; GFNIAVX-NEXT:    andq %rax, %rdi
+; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX-NEXT:    movq %rax, %rdx
+; GFNIAVX-NEXT:    andq %rcx, %rdx
+; GFNIAVX-NEXT:    shrq %rax
+; GFNIAVX-NEXT:    andq %rcx, %rax
+; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNIAVX-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_i64:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    bswapq %rdi
-; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX2-NEXT:    andq %rdi, %rax
-; GFNIAVX2-NEXT:    shlq $4, %rax
-; GFNIAVX2-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX2-NEXT:    andq %rdi, %rcx
-; GFNIAVX2-NEXT:    shrq $4, %rcx
-; GFNIAVX2-NEXT:    orq %rax, %rcx
-; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT:    movq %rdi, %rax
+; GFNIAVX2-NEXT:    shrq $4, %rax
+; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; GFNIAVX2-NEXT:    andq %rcx, %rax
-; GFNIAVX2-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX2-NEXT:    andq %rcx, %rdx
-; GFNIAVX2-NEXT:    shrq $2, %rdx
-; GFNIAVX2-NEXT:    leaq (%rdx,%rax,4), %rax
-; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT:    andq %rcx, %rdi
+; GFNIAVX2-NEXT:    shlq $4, %rdi
+; GFNIAVX2-NEXT:    orq %rax, %rdi
+; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT:    movq %rdi, %rcx
 ; GFNIAVX2-NEXT:    andq %rax, %rcx
-; GFNIAVX2-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX2-NEXT:    andq %rax, %rdx
-; GFNIAVX2-NEXT:    shrq %rdx
-; GFNIAVX2-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX2-NEXT:    shrq $2, %rdi
+; GFNIAVX2-NEXT:    andq %rax, %rdi
+; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT:    movq %rax, %rdx
+; GFNIAVX2-NEXT:    andq %rcx, %rdx
+; GFNIAVX2-NEXT:    shrq %rax
+; GFNIAVX2-NEXT:    andq %rcx, %rax
+; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512F-LABEL: test_bitreverse_i64:
 ; GFNIAVX512F:       # %bb.0:
 ; GFNIAVX512F-NEXT:    bswapq %rdi
-; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX512F-NEXT:    andq %rdi, %rax
-; GFNIAVX512F-NEXT:    shlq $4, %rax
-; GFNIAVX512F-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX512F-NEXT:    andq %rdi, %rcx
-; GFNIAVX512F-NEXT:    shrq $4, %rcx
-; GFNIAVX512F-NEXT:    orq %rax, %rcx
-; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512F-NEXT:    movq %rdi, %rax
+; GFNIAVX512F-NEXT:    shrq $4, %rax
+; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; GFNIAVX512F-NEXT:    andq %rcx, %rax
-; GFNIAVX512F-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX512F-NEXT:    andq %rcx, %rdx
-; GFNIAVX512F-NEXT:    shrq $2, %rdx
-; GFNIAVX512F-NEXT:    leaq (%rdx,%rax,4), %rax
-; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512F-NEXT:    andq %rcx, %rdi
+; GFNIAVX512F-NEXT:    shlq $4, %rdi
+; GFNIAVX512F-NEXT:    orq %rax, %rdi
+; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512F-NEXT:    movq %rdi, %rcx
 ; GFNIAVX512F-NEXT:    andq %rax, %rcx
-; GFNIAVX512F-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX512F-NEXT:    andq %rax, %rdx
-; GFNIAVX512F-NEXT:    shrq %rdx
-; GFNIAVX512F-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512F-NEXT:    shrq $2, %rdi
+; GFNIAVX512F-NEXT:    andq %rax, %rdi
+; GFNIAVX512F-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512F-NEXT:    movq %rax, %rdx
+; GFNIAVX512F-NEXT:    andq %rcx, %rdx
+; GFNIAVX512F-NEXT:    shrq %rax
+; GFNIAVX512F-NEXT:    andq %rcx, %rax
+; GFNIAVX512F-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNIAVX512F-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: test_bitreverse_i64:
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    bswapq %rdi
-; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX512BW-NEXT:    andq %rdi, %rax
-; GFNIAVX512BW-NEXT:    shlq $4, %rax
-; GFNIAVX512BW-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX512BW-NEXT:    andq %rdi, %rcx
-; GFNIAVX512BW-NEXT:    shrq $4, %rcx
-; GFNIAVX512BW-NEXT:    orq %rax, %rcx
-; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512BW-NEXT:    movq %rdi, %rax
+; GFNIAVX512BW-NEXT:    shrq $4, %rax
+; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; GFNIAVX512BW-NEXT:    andq %rcx, %rax
-; GFNIAVX512BW-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
-; GFNIAVX512BW-NEXT:    shrq $2, %rdx
-; GFNIAVX512BW-NEXT:    leaq (%rdx,%rax,4), %rax
-; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    andq %rcx, %rdi
+; GFNIAVX512BW-NEXT:    shlq $4, %rdi
+; GFNIAVX512BW-NEXT:    orq %rax, %rdi
+; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512BW-NEXT:    movq %rdi, %rcx
 ; GFNIAVX512BW-NEXT:    andq %rax, %rcx
-; GFNIAVX512BW-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX512BW-NEXT:    andq %rax, %rdx
-; GFNIAVX512BW-NEXT:    shrq %rdx
-; GFNIAVX512BW-NEXT:    leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512BW-NEXT:    shrq $2, %rdi
+; GFNIAVX512BW-NEXT:    andq %rax, %rdi
+; GFNIAVX512BW-NEXT:    leaq (%rdi,%rcx,4), %rax
+; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    movq %rax, %rdx
+; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
+; GFNIAVX512BW-NEXT:    shrq %rax
+; GFNIAVX512BW-NEXT:    andq %rcx, %rax
+; GFNIAVX512BW-NEXT:    leaq (%rax,%rdx,2), %rax
 ; GFNIAVX512BW-NEXT:    retq
   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %b
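(A similar note for the vector checks that follow: the SSE2 lowering now uses one splat constant per swap step for both the shifted and unshifted operand, which is why the updated CHECK lines load half as many constants from the constant pool. The intrinsics sketch below is illustrative only, not part of the patch; the function name and the use of _mm_set1_epi8 in place of a constant pool load are assumptions. It shows the 2-bit step; the 1-bit step has the same shape except the left shift is a paddb, since adding a byte vector to itself is a shift left by one.)

    #include <emmintrin.h>  // SSE2

    // Sketch of one swap step as it appears in the new SSE2 codegen:
    // psrlw + pand on one copy, pand + psllw on the other, then por.
    __m128i swap_bit_pairs_sketch(__m128i v) {
      const __m128i m = _mm_set1_epi8(0x33);                 // per-byte mask
      __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 2), m);   // (v >> 2) & 0x33
      __m128i lo = _mm_slli_epi16(_mm_and_si128(v, m), 2);   // (v & 0x33) << 2
      return _mm_or_si128(hi, lo);
    }
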
@@ -687,16 +687,18 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psllw $2, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -775,16 +777,18 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psllw $2, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -875,16 +879,18 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psllw $2, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -977,16 +983,18 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psllw $2, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -1071,38 +1079,38 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm4, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    psllw $4, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    psllw $2, %xmm3
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
-; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v32i8:
@@ -1248,42 +1256,42 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm4, %xmm7
-; SSE2-NEXT:    psrlw $1, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
-; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:    psrlw $8, %xmm7
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm5
 ; SSE2-NEXT:    psllw $8, %xmm2
-; SSE2-NEXT:    por %xmm7, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:    psllw $4, %xmm7
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    psllw $2, %xmm3
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
-; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v16i16:
@@ -1434,63 +1442,63 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw $4, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllw $4, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm5
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psllw $4, %xmm3
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    psllw $2, %xmm3
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v8i32:
@@ -1641,67 +1649,67 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw $4, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllw $4, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm5
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psllw $4, %xmm3
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    psllw $2, %xmm3
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v4i64:
@@ -1851,7 +1859,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v64i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1860,76 +1868,76 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    psllw $2, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
+; SSE2-NEXT:    psrlw $2, %xmm6
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm8, %xmm6
 ; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm6, %xmm7
 ; SSE2-NEXT:    psrlw $1, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm9, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pand %xmm6, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm7, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
 ; SSE2-NEXT:    psllw $4, %xmm7
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm7, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm7, %xmm5
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
-; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    pandn %xmm4, %xmm7
+; SSE2-NEXT:    pandn %xmm5, %xmm7
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    por %xmm7, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
-; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    psllw $2, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm2
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm10
-; SSE2-NEXT:    pand %xmm3, %xmm10
-; SSE2-NEXT:    pandn %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm10, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
 ; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    pand %xmm9, %xmm3
+; SSE2-NEXT:    psllw $2, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm3
 ; SSE2-NEXT:    paddb %xmm3, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v64i8:
@@ -2152,20 +2160,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm10, %xmm6
-; SSE2-NEXT:    psllw $2, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
+; SSE2-NEXT:    psrlw $2, %xmm6
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm8, %xmm6
 ; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm6, %xmm7
 ; SSE2-NEXT:    psrlw $1, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT:    pand %xmm9, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pand %xmm6, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm7, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
@@ -2180,15 +2186,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; SSE2-NEXT:    pand %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
 ; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pand %xmm6, %xmm5
 ; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
@@ -2203,15 +2209,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    por %xmm7, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    psllw $2, %xmm2
 ; SSE2-NEXT:    por %xmm5, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm6, %xmm5
 ; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm2
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
 ; SSE2-NEXT:    por %xmm5, %xmm2
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
@@ -2224,16 +2230,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; SSE2-NEXT:    pand %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm5, %xmm3
 ; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm10
-; SSE2-NEXT:    psllw $2, %xmm10
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
 ; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    por %xmm10, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    pand %xmm9, %xmm3
+; SSE2-NEXT:    psllw $2, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm3
 ; SSE2-NEXT:    paddb %xmm3, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v32i16:
@@ -2478,118 +2486,118 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm3, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    psllw $4, %xmm6
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    pandn %xmm5, %xmm7
+; SSE2-NEXT:    pandn %xmm6, %xmm7
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    psllw $2, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    psrlw $2, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm9, %xmm7
 ; SSE2-NEXT:    pand %xmm9, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    psllw $4, %xmm6
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pandn %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
 ; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm6
+; SSE2-NEXT:    pandn %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
 ; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    psllw $2, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm2
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm6
-; SSE2-NEXT:    pand %xmm3, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
-; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm7
-; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    packuswb %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm9, %xmm4
 ; SSE2-NEXT:    pand %xmm9, %xmm3
+; SSE2-NEXT:    psllw $2, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm3
 ; SSE2-NEXT:    paddb %xmm3, %xmm3
-; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v16i32:
@@ -2834,126 +2842,126 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v8i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm3, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    psllw $4, %xmm6
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    pandn %xmm5, %xmm7
+; SSE2-NEXT:    pandn %xmm6, %xmm7
 ; SSE2-NEXT:    psrlw $4, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    psllw $2, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    psrlw $2, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    pand %xmm9, %xmm7
 ; SSE2-NEXT:    pand %xmm9, %xmm0
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm0
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    psllw $4, %xmm6
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pandn %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
 ; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm6
+; SSE2-NEXT:    pandn %xmm5, %xmm6
 ; SSE2-NEXT:    psrlw $4, %xmm2
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
 ; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    psllw $2, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm2
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    packuswb %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    psllw $4, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm6
-; SSE2-NEXT:    pand %xmm3, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    psllw $2, %xmm5
-; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm7
-; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    packuswb %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm9, %xmm4
 ; SSE2-NEXT:    pand %xmm9, %xmm3
+; SSE2-NEXT:    psllw $2, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm3
 ; SSE2-NEXT:    paddb %xmm3, %xmm3
-; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v8i64:


        

