[llvm] 8bb2428 - [SelectionDAG] Optimize bitreverse expansion to minimize the number of mask constants.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 26 09:33:41 PDT 2021
Author: Craig Topper
Date: 2021-08-26T09:33:24-07:00
New Revision: 8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b
URL: https://github.com/llvm/llvm-project/commit/8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b
DIFF: https://github.com/llvm/llvm-project/commit/8bb24289f3ac2bcf36d44d4951dc1a5e6822ae7b.diff
LOG: [SelectionDAG] Optimize bitreverse expansion to minimize the number of mask constants.
We can halve the number of mask constants by masking before shl
and after srl.
This can reduce the number of mov-immediate or constant
materializations, or the number of constant pool loads for
X86 vectors.
I think we might be able to do something similar for bswap. I'll
look at it next.
Differential Revision: https://reviews.llvm.org/D108738
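
For illustration, here is a scalar C++ sketch (mine, not part of the
patch; the function names are hypothetical) of the equivalent i32
transformation. The old expansion needs two mask constants per swap
step, while masking after the srl lets one constant serve both halves:

  #include <cstdint>

  // Old form: two masks per swap step (0xF0/0x0F, 0xCC/0x33, 0xAA/0x55).
  uint32_t bitreverse32_old(uint32_t V) {
    V = __builtin_bswap32(V);
    V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4);
    V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2);
    V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1);
    return V;
  }

  // New form: shift first, then mask, so each step reuses one constant.
  uint32_t bitreverse32_new(uint32_t V) {
    V = __builtin_bswap32(V);
    V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4);
    V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2);
    V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1);
    return V;
  }

The two are equivalent because (V & HiMask) >> N == (V >> N) & LoMask
when HiMask == LoMask << N; the RISC-V test diffs below show the effect
as the removal of the lui/addi(w) sequences that materialized the
complemented masks.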
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/RISCV/rv32zbp.ll
llvm/test/CodeGen/RISCV/rv64zbp.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
llvm/test/CodeGen/X86/bitreverse.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/pr43820.ll
llvm/test/CodeGen/X86/vector-bitreverse.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index accda2588c883..0cd15de4d6413 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7296,34 +7296,31 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
// TODO: We can easily support i4/i2 legal types if any target ever does.
if (Sz >= 8 && isPowerOf2_32(Sz)) {
// Create the masks - repeating the pattern every byte.
- APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
- APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
- APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
- APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
- APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
- APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+ APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+ APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+ APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));
// BSWAP if the type is wider than a single byte.
Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
- // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+ // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
- // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+ // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
- // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+ // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
return Tmp;
diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index cfad9fb9110a4..1717526a608cd 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -2453,13 +2453,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: andi a1, a0, 51
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: andi a0, a0, 204
; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: andi a1, a0, 85
; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: andi a0, a0, 170
; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: andi a0, a0, 85
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
@@ -2484,33 +2484,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 1
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 4
-; RV32I-NEXT: lui a2, 15
-; RV32I-NEXT: addi a2, a2, 240
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: lui a2, 1
+; RV32I-NEXT: addi a2, a2, -241
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 4
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 3
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: lui a2, 13
-; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: lui a2, 3
+; RV32I-NEXT: addi a2, a2, 819
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 5
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: lui a2, 11
-; RV32I-NEXT: addi a2, a2, -1366
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 5
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i16:
@@ -2543,33 +2537,27 @@ define i32 @bitreverse_i32(i32 %a) nounwind {
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 4
-; RV32I-NEXT: lui a2, 986895
-; RV32I-NEXT: addi a2, a2, 240
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: lui a2, 61681
+; RV32I-NEXT: addi a2, a2, -241
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 4
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: lui a2, 838861
-; RV32I-NEXT: addi a2, a2, -820
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: lui a2, 209715
+; RV32I-NEXT: addi a2, a2, 819
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: lui a2, 699051
-; RV32I-NEXT: addi a2, a2, -1366
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i32:
@@ -2602,58 +2590,52 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: lui a2, 61681
-; RV32I-NEXT: addi t0, a2, -241
-; RV32I-NEXT: and a2, a1, t0
-; RV32I-NEXT: slli a2, a2, 4
-; RV32I-NEXT: lui a3, 986895
-; RV32I-NEXT: addi t1, a3, 240
-; RV32I-NEXT: and a1, a1, t1
-; RV32I-NEXT: srli a1, a1, 4
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: lui a2, 209715
-; RV32I-NEXT: addi t2, a2, 819
-; RV32I-NEXT: and a2, a1, t2
-; RV32I-NEXT: slli a2, a2, 2
-; RV32I-NEXT: lui a4, 838861
-; RV32I-NEXT: addi t3, a4, -820
-; RV32I-NEXT: and a1, a1, t3
-; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi a3, a2, 1365
-; RV32I-NEXT: and a2, a1, a3
-; RV32I-NEXT: slli a2, a2, 1
-; RV32I-NEXT: lui a5, 699051
-; RV32I-NEXT: addi a5, a5, -1366
+; RV32I-NEXT: srli a2, a1, 4
+; RV32I-NEXT: lui a4, 61681
+; RV32I-NEXT: addi a4, a4, -241
+; RV32I-NEXT: and a2, a2, a4
+; RV32I-NEXT: and a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: srli a2, a1, 2
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: srli a2, a1, 1
+; RV32I-NEXT: lui a5, 349525
+; RV32I-NEXT: addi a5, a5, 1365
+; RV32I-NEXT: and a2, a2, a5
; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: or a2, a1, a2
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: or t0, a2, a1
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a6
-; RV32I-NEXT: srli a4, a0, 24
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: slli a4, a0, 8
-; RV32I-NEXT: and a4, a4, a7
+; RV32I-NEXT: srli a2, a0, 24
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: slli a2, a0, 8
+; RV32I-NEXT: and a2, a2, a7
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: and a1, a0, t0
-; RV32I-NEXT: slli a1, a1, 4
-; RV32I-NEXT: and a0, a0, t1
-; RV32I-NEXT: srli a0, a0, 4
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: and a1, a0, t2
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: and a0, a0, t3
-; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: and a1, a0, a3
-; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: and a1, a1, a4
+; RV32I-NEXT: and a0, a0, a4
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: or a1, a1, a0
+; RV32I-NEXT: mv a0, t0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i64:
@@ -2756,33 +2738,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 4
-; RV32I-NEXT: lui a3, 986895
-; RV32I-NEXT: addi a3, a3, 240
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: lui a3, 61681
+; RV32I-NEXT: addi a3, a3, -241
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: srli a0, a0, 4
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: lui a3, 838861
-; RV32I-NEXT: addi a3, a3, -820
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: lui a3, 209715
+; RV32I-NEXT: addi a3, a3, 819
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: and a1, a0, a1
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: lui a3, 699051
-; RV32I-NEXT: addi a3, a3, -1366
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a3, 349525
+; RV32I-NEXT: addi a3, a3, 1365
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: srli a2, a0, 24
@@ -2813,82 +2789,76 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV32I: # %bb.0:
; RV32I-NEXT: srli a3, a1, 8
; RV32I-NEXT: lui a2, 16
-; RV32I-NEXT: addi t0, a2, -256
-; RV32I-NEXT: and a3, a3, t0
+; RV32I-NEXT: addi a6, a2, -256
+; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: srli a4, a1, 24
-; RV32I-NEXT: or a4, a3, a4
-; RV32I-NEXT: slli a5, a1, 8
-; RV32I-NEXT: lui t1, 4080
-; RV32I-NEXT: and a5, a5, t1
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a4, a1, 8
+; RV32I-NEXT: lui a7, 4080
+; RV32I-NEXT: and a4, a4, a7
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: srli a3, a1, 4
; RV32I-NEXT: lui a4, 61681
-; RV32I-NEXT: addi a6, a4, -241
-; RV32I-NEXT: and a5, a1, a6
-; RV32I-NEXT: slli a5, a5, 4
-; RV32I-NEXT: lui a4, 986895
-; RV32I-NEXT: addi a7, a4, 240
-; RV32I-NEXT: and a1, a1, a7
-; RV32I-NEXT: srli a1, a1, 4
-; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: lui a5, 209715
-; RV32I-NEXT: addi t2, a5, 819
-; RV32I-NEXT: and a4, a1, t2
-; RV32I-NEXT: slli a4, a4, 2
-; RV32I-NEXT: lui a2, 838861
-; RV32I-NEXT: addi t3, a2, -820
-; RV32I-NEXT: and a1, a1, t3
-; RV32I-NEXT: srli a1, a1, 2
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: lui a4, 349525
-; RV32I-NEXT: addi a4, a4, 1365
-; RV32I-NEXT: and a3, a1, a4
-; RV32I-NEXT: slli a3, a3, 1
-; RV32I-NEXT: lui a5, 699051
-; RV32I-NEXT: addi a5, a5, -1366
+; RV32I-NEXT: addi t0, a4, -241
+; RV32I-NEXT: and a3, a3, t0
+; RV32I-NEXT: and a1, a1, t0
+; RV32I-NEXT: slli a1, a1, 4
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srli a3, a1, 2
+; RV32I-NEXT: lui a2, 209715
+; RV32I-NEXT: addi a2, a2, 819
+; RV32I-NEXT: and a3, a3, a2
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srli a3, a1, 1
+; RV32I-NEXT: lui a5, 349525
+; RV32I-NEXT: addi a5, a5, 1365
+; RV32I-NEXT: and a3, a3, a5
; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: or a1, a3, a1
; RV32I-NEXT: srli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t0
-; RV32I-NEXT: srli a2, a0, 24
-; RV32I-NEXT: or a2, a3, a2
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a6
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a4, a0, 8
+; RV32I-NEXT: and a4, a4, a7
; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: or a0, a0, a3
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: and a2, a0, a6
-; RV32I-NEXT: slli a2, a2, 4
-; RV32I-NEXT: and a0, a0, a7
-; RV32I-NEXT: srli a0, a0, 4
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: and a2, a0, t2
-; RV32I-NEXT: slli a2, a2, 2
-; RV32I-NEXT: and a0, a0, t3
-; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: and a2, a0, a4
-; RV32I-NEXT: slli a2, a2, 1
+; RV32I-NEXT: srli a3, a0, 4
+; RV32I-NEXT: and a3, a3, t0
+; RV32I-NEXT: and a0, a0, t0
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: srli a3, a0, 2
+; RV32I-NEXT: and a3, a3, a2
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: srli a2, a0, 1
+; RV32I-NEXT: and a2, a2, a5
; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: or a0, a2, a0
; RV32I-NEXT: srli a2, a0, 8
-; RV32I-NEXT: and a2, a2, t0
+; RV32I-NEXT: and a2, a2, a6
; RV32I-NEXT: srli a3, a0, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a7
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: srli a2, a1, 8
-; RV32I-NEXT: and a2, a2, t0
+; RV32I-NEXT: and a2, a2, a6
; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: and a3, a3, t1
+; RV32I-NEXT: and a3, a3, a7
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a1, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll
index c83698b70b763..4c35a53e61db6 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll
@@ -2816,13 +2816,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: andi a1, a0, 51
; RV64I-NEXT: slli a1, a1, 2
-; RV64I-NEXT: andi a0, a0, 204
; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: andi a0, a0, 51
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: andi a1, a0, 85
; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: andi a0, a0, 170
; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
@@ -2847,33 +2847,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: slli a0, a0, 8
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 1
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 4
-; RV64I-NEXT: lui a2, 15
-; RV64I-NEXT: addiw a2, a2, 240
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: addiw a2, a2, -241
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 3
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 2
-; RV64I-NEXT: lui a2, 13
-; RV64I-NEXT: addiw a2, a2, -820
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: lui a2, 3
+; RV64I-NEXT: addiw a2, a2, 819
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 5
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: lui a2, 11
-; RV64I-NEXT: addiw a2, a2, -1366
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV64B-LABEL: bitreverse_i16:
@@ -2906,35 +2900,27 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 4
-; RV64I-NEXT: lui a2, 241
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: lui a2, 61681
; RV64I-NEXT: addiw a2, a2, -241
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 240
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 2
-; RV64I-NEXT: lui a2, 838861
-; RV64I-NEXT: addiw a2, a2, -820
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: lui a2, 209715
+; RV64I-NEXT: addiw a2, a2, 819
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: lui a2, 699051
-; RV64I-NEXT: addiw a2, a2, -1366
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 349525
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
@@ -2967,35 +2953,27 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addiw a2, a2, -241
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 4
-; RV64I-NEXT: lui a3, 241
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: lui a3, 61681
; RV64I-NEXT: addiw a3, a3, -241
-; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, 240
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 209715
-; RV64I-NEXT: addiw a2, a2, 819
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 2
-; RV64I-NEXT: lui a3, 838861
-; RV64I-NEXT: addiw a3, a3, -820
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: srli a2, a0, 2
+; RV64I-NEXT: lui a3, 209715
+; RV64I-NEXT: addiw a3, a3, 819
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 1
-; RV64I-NEXT: lui a3, 699051
-; RV64I-NEXT: addiw a3, a3, -1366
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: srli a2, a0, 1
+; RV64I-NEXT: lui a3, 349525
+; RV64I-NEXT: addiw a3, a3, 1365
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: sw a0, 0(a1)
; RV64I-NEXT: ret
;
@@ -3049,69 +3027,45 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 3855
-; RV64I-NEXT: addiw a1, a1, 241
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, -241
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 241
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, -241
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 4
-; RV64I-NEXT: lui a2, 1044721
-; RV64I-NEXT: addiw a2, a2, -241
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: lui a2, 3855
+; RV64I-NEXT: addiw a2, a2, 241
+; RV64I-NEXT: slli a2, a2, 12
+; RV64I-NEXT: addi a2, a2, -241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -241
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 240
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 13107
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 819
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 819
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 819
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 2
-; RV64I-NEXT: lui a2, 1035469
-; RV64I-NEXT: addiw a2, a2, -819
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: lui a2, 13107
+; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -819
+; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -819
+; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -820
+; RV64I-NEXT: addi a2, a2, 819
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 21845
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 1365
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 1365
-; RV64I-NEXT: slli a1, a1, 12
-; RV64I-NEXT: addi a1, a1, 1365
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: lui a2, 1026731
-; RV64I-NEXT: addiw a2, a2, -1365
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 21845
+; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -1365
+; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -1365
+; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -1366
+; RV64I-NEXT: addi a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV64B-LABEL: bitreverse_i64:
@@ -3210,35 +3164,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 4
-; RV64I-NEXT: lui a3, 241
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: lui a3, 61681
; RV64I-NEXT: addiw a3, a3, -241
-; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, 240
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 2
-; RV64I-NEXT: lui a3, 838861
-; RV64I-NEXT: addiw a3, a3, -820
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: lui a3, 209715
+; RV64I-NEXT: addiw a3, a3, 819
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: and a1, a0, a1
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: lui a3, 699051
-; RV64I-NEXT: addiw a3, a3, -1366
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a3, 349525
+; RV64I-NEXT: addiw a3, a3, 1365
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a2, a0, 24
@@ -3267,14 +3213,14 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-LABEL: bitreverse_bswap_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: srli a2, a0, 24
; RV64I-NEXT: lui a6, 4080
-; RV64I-NEXT: and a1, a1, a6
-; RV64I-NEXT: srli a3, a0, 8
+; RV64I-NEXT: and a3, a2, a6
+; RV64I-NEXT: srli a4, a0, 8
; RV64I-NEXT: addi a5, zero, 255
; RV64I-NEXT: slli a7, a5, 24
-; RV64I-NEXT: and a3, a3, a7
-; RV64I-NEXT: or a3, a3, a1
+; RV64I-NEXT: and a4, a4, a7
+; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: srli a4, a0, 40
; RV64I-NEXT: lui a1, 16
; RV64I-NEXT: addiw a1, a1, -256
@@ -3282,9 +3228,9 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-NEXT: srli a2, a0, 56
; RV64I-NEXT: or a2, a4, a2
; RV64I-NEXT: or a2, a3, a2
-; RV64I-NEXT: slli a4, a0, 8
+; RV64I-NEXT: slli a3, a0, 8
; RV64I-NEXT: slli t0, a5, 32
-; RV64I-NEXT: and a3, a4, t0
+; RV64I-NEXT: and a3, a3, t0
; RV64I-NEXT: slli a4, a0, 24
; RV64I-NEXT: slli t1, a5, 40
; RV64I-NEXT: and a4, a4, t1
@@ -3296,69 +3242,45 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 3855
-; RV64I-NEXT: addiw a2, a2, 241
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -241
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 241
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, -241
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 4
-; RV64I-NEXT: lui a3, 1044721
-; RV64I-NEXT: addiw a3, a3, -241
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: lui a3, 3855
+; RV64I-NEXT: addiw a3, a3, 241
+; RV64I-NEXT: slli a3, a3, 12
+; RV64I-NEXT: addi a3, a3, -241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, 241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -241
-; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, 240
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 4
-; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 13107
-; RV64I-NEXT: addiw a2, a2, 819
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 2
-; RV64I-NEXT: lui a3, 1035469
-; RV64I-NEXT: addiw a3, a3, -819
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: srli a2, a0, 2
+; RV64I-NEXT: lui a3, 13107
+; RV64I-NEXT: addiw a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -819
+; RV64I-NEXT: addi a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -819
+; RV64I-NEXT: addi a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -820
+; RV64I-NEXT: addi a3, a3, 819
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: lui a2, 21845
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: slli a2, a2, 12
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: and a2, a0, a2
-; RV64I-NEXT: slli a2, a2, 1
-; RV64I-NEXT: lui a3, 1026731
-; RV64I-NEXT: addiw a3, a3, -1365
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: srli a2, a0, 1
+; RV64I-NEXT: lui a3, 21845
+; RV64I-NEXT: addiw a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -1365
+; RV64I-NEXT: addi a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -1365
+; RV64I-NEXT: addi a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
-; RV64I-NEXT: addi a3, a3, -1366
+; RV64I-NEXT: addi a3, a3, 1365
+; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
-; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 40
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: srli a2, a0, 56
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index c1a9fe20aa93e..a3180f0b4e317 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -12,33 +12,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 8
; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 8
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV32-NEXT: lui a1, 1
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: lui a1, 15
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV32-NEXT: lui a1, 3
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: lui a1, 13
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV32-NEXT: lui a1, 5
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT: lui a1, 11
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV32-NEXT: vse16.v v25, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -49,33 +43,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 8
; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 8
; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV64-NEXT: lui a1, 1
; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: lui a1, 15
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV64-NEXT: lui a1, 3
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: lui a1, 13
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV64-NEXT: lui a1, 5
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT: lui a1, 11
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV64-NEXT: vse16.v v25, (a0)
; LMULMAX2-RV64-NEXT: ret
;
@@ -86,33 +74,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 8
; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV32-NEXT: lui a1, 1
; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: lui a1, 15
-; LMULMAX1-RV32-NEXT: addi a1, a1, 240
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV32-NEXT: lui a1, 3
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: lui a1, 13
-; LMULMAX1-RV32-NEXT: addi a1, a1, -820
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV32-NEXT: lui a1, 5
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT: lui a1, 11
-; LMULMAX1-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV32-NEXT: vse16.v v25, (a0)
; LMULMAX1-RV32-NEXT: ret
;
@@ -123,33 +105,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 8
; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV64-NEXT: lui a1, 1
; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: lui a1, 15
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 240
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV64-NEXT: lui a1, 3
; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: lui a1, 13
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -820
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV64-NEXT: lui a1, 5
; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT: lui a1, 11
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -1366
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV64-NEXT: vse16.v v25, (a0)
; LMULMAX1-RV64-NEXT: ret
%a = load <8 x i16>, <8 x i16>* %x
@@ -177,33 +153,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 24
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: lui a1, 986895
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: lui a1, 838861
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT: lui a1, 699051
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV32-NEXT: vse32.v v25, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -223,39 +193,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 24
; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV64-NEXT: lui a1, 61681
; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: lui a1, 241
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV64-NEXT: lui a1, 209715
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: lui a1, 205
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV64-NEXT: lui a1, 349525
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT: lui a1, 171
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV64-NEXT: vse32.v v25, (a0)
; LMULMAX2-RV64-NEXT: ret
;
@@ -275,33 +233,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV32-NEXT: lui a1, 61681
; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: lui a1, 986895
-; LMULMAX1-RV32-NEXT: addi a1, a1, 240
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: lui a1, 838861
-; LMULMAX1-RV32-NEXT: addi a1, a1, -820
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT: lui a1, 699051
-; LMULMAX1-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV32-NEXT: vse32.v v25, (a0)
; LMULMAX1-RV32-NEXT: ret
;
@@ -321,39 +273,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV64-NEXT: lui a1, 61681
; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: lui a1, 241
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, 240
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV64-NEXT: lui a1, 209715
; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: lui a1, 205
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -820
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV64-NEXT: lui a1, 349525
; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT: lui a1, 171
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV64-NEXT: vse32.v v25, (a0)
; LMULMAX1-RV64-NEXT: ret
%a = load <4 x i32>, <4 x i32>* %x
@@ -416,51 +356,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v28
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: lui a1, 986895
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: lui a1, 838861
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV32-NEXT: lui a1, 699051
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV32-NEXT: vse64.v v25, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -499,6 +424,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX2-RV64-NEXT: vor.vv v25, v28, v25
; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX2-RV64-NEXT: lui a1, 3855
; LMULMAX2-RV64-NEXT: addiw a1, a1, 241
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -507,19 +433,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 241
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: lui a1, 1044721
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX2-RV64-NEXT: lui a1, 13107
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -528,19 +446,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 819
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: lui a1, 1035469
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX2-RV64-NEXT: lui a1, 21845
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -549,19 +459,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 1365
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX2-RV64-NEXT: lui a1, 1026731
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX2-RV64-NEXT: vse64.v v25, (a0)
; LMULMAX2-RV64-NEXT: ret
;
@@ -616,51 +517,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV32-NEXT: lui a1, 61681
; LMULMAX1-RV32-NEXT: addi a1, a1, -241
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: lui a1, 986895
-; LMULMAX1-RV32-NEXT: addi a1, a1, 240
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: lui a1, 838861
-; LMULMAX1-RV32-NEXT: addi a1, a1, -820
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT: lui a1, 699051
-; LMULMAX1-RV32-NEXT: addi a1, a1, -1366
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27
; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV32-NEXT: vse64.v v25, (a0)
; LMULMAX1-RV32-NEXT: ret
;
@@ -699,6 +585,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vor.vv v25, v28, v25
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4
; LMULMAX1-RV64-NEXT: lui a1, 3855
; LMULMAX1-RV64-NEXT: addiw a1, a1, 241
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
@@ -707,19 +594,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX1-RV64-NEXT: addi a1, a1, 241
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
; LMULMAX1-RV64-NEXT: addi a1, a1, -241
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: lui a1, 1044721
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, 241
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -241
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, 240
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2
; LMULMAX1-RV64-NEXT: lui a1, 13107
; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
@@ -728,19 +607,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX1-RV64-NEXT: addi a1, a1, 819
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
; LMULMAX1-RV64-NEXT: addi a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: lui a1, 1035469
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -819
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -819
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -820
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1
; LMULMAX1-RV64-NEXT: lui a1, 21845
; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
@@ -749,19 +620,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; LMULMAX1-RV64-NEXT: addi a1, a1, 1365
; LMULMAX1-RV64-NEXT: slli a1, a1, 12
; LMULMAX1-RV64-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV64-NEXT: lui a1, 1026731
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX1-RV64-NEXT: slli a1, a1, 12
-; LMULMAX1-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV64-NEXT: vse64.v v25, (a0)
; LMULMAX1-RV64-NEXT: ret
%a = load <2 x i64>, <2 x i64>* %x
@@ -780,33 +642,27 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 8
; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 8
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV32-NEXT: lui a1, 1
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT: lui a1, 15
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV32-NEXT: lui a1, 3
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT: lui a1, 13
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV32-NEXT: lui a1, 5
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT: lui a1, 11
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV32-NEXT: vse16.v v26, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -817,150 +673,132 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 8
; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 8
; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV64-NEXT: lui a1, 1
; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT: lui a1, 15
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV64-NEXT: lui a1, 3
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT: lui a1, 13
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV64-NEXT: lui a1, 5
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT: lui a1, 11
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV64-NEXT: vse16.v v26, (a0)
; LMULMAX2-RV64-NEXT: ret
;
; LMULMAX1-RV32-LABEL: bitreverse_v16i16:
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle16.v v25, (a6)
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vle16.v v25, (a1)
; LMULMAX1-RV32-NEXT: vle16.v v26, (a0)
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8
; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 4
; LMULMAX1-RV32-NEXT: lui a2, 1
-; LMULMAX1-RV32-NEXT: addi a7, a2, -241
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a7
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT: lui a3, 15
-; LMULMAX1-RV32-NEXT: addi a3, a3, 240
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a4, 3
-; LMULMAX1-RV32-NEXT: addi a4, a4, 819
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a4
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: lui a5, 13
-; LMULMAX1-RV32-NEXT: addi a5, a5, -820
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a1, 5
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV32-NEXT: lui a2, 11
-; LMULMAX1-RV32-NEXT: addi a2, a2, -1366
+; LMULMAX1-RV32-NEXT: addi a2, a2, -241
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 2
+; LMULMAX1-RV32-NEXT: lui a3, 3
+; LMULMAX1-RV32-NEXT: addi a3, a3, 819
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 1
+; LMULMAX1-RV32-NEXT: lui a4, 5
+; LMULMAX1-RV32-NEXT: addi a4, a4, 1365
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a4
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8
; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 8
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a7
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a4
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 4
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 2
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3
+; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 1
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4
+; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
; LMULMAX1-RV32-NEXT: vse16.v v26, (a0)
-; LMULMAX1-RV32-NEXT: vse16.v v25, (a6)
+; LMULMAX1-RV32-NEXT: vse16.v v25, (a1)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bitreverse_v16i16:
; LMULMAX1-RV64: # %bb.0:
; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX1-RV64-NEXT: addi a6, a0, 16
-; LMULMAX1-RV64-NEXT: vle16.v v25, (a6)
+; LMULMAX1-RV64-NEXT: addi a1, a0, 16
+; LMULMAX1-RV64-NEXT: vle16.v v25, (a1)
; LMULMAX1-RV64-NEXT: vle16.v v26, (a0)
; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8
; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4
; LMULMAX1-RV64-NEXT: lui a2, 1
-; LMULMAX1-RV64-NEXT: addiw a7, a2, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a7
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: lui a3, 15
-; LMULMAX1-RV64-NEXT: addiw a3, a3, 240
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a4, 3
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: lui a5, 13
-; LMULMAX1-RV64-NEXT: addiw a5, a5, -820
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a1, 5
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: lui a2, 11
-; LMULMAX1-RV64-NEXT: addiw a2, a2, -1366
+; LMULMAX1-RV64-NEXT: addiw a2, a2, -241
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a2
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT: lui a3, 3
+; LMULMAX1-RV64-NEXT: addiw a3, a3, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT: lui a4, 5
+; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8
; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 8
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a7
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a2
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
; LMULMAX1-RV64-NEXT: vse16.v v26, (a0)
-; LMULMAX1-RV64-NEXT: vse16.v v25, (a6)
+; LMULMAX1-RV64-NEXT: vse16.v v25, (a1)
; LMULMAX1-RV64-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %x
%b = load <16 x i16>, <16 x i16>* %y
@@ -987,33 +825,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 24
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v30
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT: lui a1, 986895
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT: lui a1, 838861
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT: lui a1, 699051
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV32-NEXT: vse32.v v26, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -1033,39 +865,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 24
; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30
; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV64-NEXT: lui a1, 61681
; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT: lui a1, 241
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV64-NEXT: lui a1, 209715
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT: lui a1, 205
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV64-NEXT: lui a1, 349525
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT: lui a1, 171
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV64-NEXT: vse32.v v26, (a0)
; LMULMAX2-RV64-NEXT: ret
;
@@ -1077,67 +897,61 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV32-NEXT: vle32.v v26, (a0)
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8
; LMULMAX1-RV32-NEXT: lui a2, 16
-; LMULMAX1-RV32-NEXT: addi a7, a2, -256
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT: addi a2, a2, -256
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV32-NEXT: lui t0, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT: lui a3, 4080
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3
; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 4
; LMULMAX1-RV32-NEXT: lui a4, 61681
-; LMULMAX1-RV32-NEXT: addi t1, a4, -241
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, t1
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT: lui a5, 986895
-; LMULMAX1-RV32-NEXT: addi a5, a5, 240
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: lui a2, 838861
-; LMULMAX1-RV32-NEXT: addi a2, a2, -820
-; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3
-; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV32-NEXT: lui a4, 699051
-; LMULMAX1-RV32-NEXT: addi a4, a4, -1366
+; LMULMAX1-RV32-NEXT: addi a4, a4, -241
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a4
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 2
+; LMULMAX1-RV32-NEXT: lui a5, 209715
+; LMULMAX1-RV32-NEXT: addi a5, a5, 819
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 1
+; LMULMAX1-RV32-NEXT: lui a1, 349525
+; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1
+; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25
; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8
-; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0
+; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3
; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, t1
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1
-; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3
-; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 4
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 2
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 1
+; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1
+; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1
+; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
+; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26
; LMULMAX1-RV32-NEXT: vse32.v v26, (a0)
; LMULMAX1-RV32-NEXT: vse32.v v25, (a6)
; LMULMAX1-RV32-NEXT: ret
@@ -1155,68 +969,56 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: lui a7, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: lui a3, 4080
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3
; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4
; LMULMAX1-RV64-NEXT: lui a4, 61681
; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: lui a5, 241
-; LMULMAX1-RV64-NEXT: addiw a5, a5, -241
-; LMULMAX1-RV64-NEXT: slli a5, a5, 12
-; LMULMAX1-RV64-NEXT: addi t0, a5, 240
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t0
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a1, 209715
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: lui a3, 205
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -819
-; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi t1, a3, -820
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: lui a5, 349525
-; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a5
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: lui a3, 171
-; LMULMAX1-RV64-NEXT: addiw a3, a3, -1365
-; LMULMAX1-RV64-NEXT: slli a3, a3, 12
-; LMULMAX1-RV64-NEXT: addi a3, a3, -1366
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT: lui a5, 209715
+; LMULMAX1-RV64-NEXT: addiw a5, a5, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT: lui a1, 349525
+; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8
; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28
; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3
; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t0
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a5
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
+; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
; LMULMAX1-RV64-NEXT: vse32.v v26, (a0)
; LMULMAX1-RV64-NEXT: vse32.v v25, (a6)
; LMULMAX1-RV64-NEXT: ret
@@ -1280,51 +1082,36 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v8
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v30
; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV32-NEXT: lui a1, 986895
-; LMULMAX2-RV32-NEXT: addi a1, a1, 240
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30
; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV32-NEXT: lui a1, 838861
-; LMULMAX2-RV32-NEXT: addi a1, a1, -820
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30
; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV32-NEXT: lui a1, 699051
-; LMULMAX2-RV32-NEXT: addi a1, a1, -1366
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30
; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV32-NEXT: vse64.v v26, (a0)
; LMULMAX2-RV32-NEXT: ret
;
@@ -1363,6 +1150,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: vor.vv v26, v8, v26
; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30
; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4
; LMULMAX2-RV64-NEXT: lui a1, 3855
; LMULMAX2-RV64-NEXT: addiw a1, a1, 241
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -1371,19 +1159,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 241
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4
-; LMULMAX2-RV64-NEXT: lui a1, 1044721
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -241
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, 240
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2
; LMULMAX2-RV64-NEXT: lui a1, 13107
; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -1392,19 +1172,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 819
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2
-; LMULMAX2-RV64-NEXT: lui a1, 1035469
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -819
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -820
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
+; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1
; LMULMAX2-RV64-NEXT: lui a1, 21845
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
@@ -1413,19 +1185,10 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: addi a1, a1, 1365
; LMULMAX2-RV64-NEXT: slli a1, a1, 12
; LMULMAX2-RV64-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28
-; LMULMAX2-RV64-NEXT: lui a1, 1026731
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1365
-; LMULMAX2-RV64-NEXT: slli a1, a1, 12
-; LMULMAX2-RV64-NEXT: addi a1, a1, -1366
+; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1
; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26
; LMULMAX2-RV64-NEXT: vse64.v v26, (a0)
; LMULMAX2-RV64-NEXT: ret
;
@@ -1433,17 +1196,17 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV32-NEXT: addi a6, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v30, (a6)
+; LMULMAX1-RV32-NEXT: vle64.v v29, (a6)
; LMULMAX1-RV32-NEXT: vle64.v v25, (a0)
; LMULMAX1-RV32-NEXT: addi a2, zero, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v26, v30, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v26, v29, a2
; LMULMAX1-RV32-NEXT: addi a3, zero, 40
-; LMULMAX1-RV32-NEXT: vsrl.vx v27, v30, a3
+; LMULMAX1-RV32-NEXT: vsrl.vx v27, v29, a3
; LMULMAX1-RV32-NEXT: lui a4, 16
; LMULMAX1-RV32-NEXT: addi a4, a4, -256
; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4
; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v26
-; LMULMAX1-RV32-NEXT: vsrl.vi v26, v30, 24
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v29, 24
; LMULMAX1-RV32-NEXT: lui a5, 4080
; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a5
; LMULMAX1-RV32-NEXT: addi a1, zero, 5
@@ -1454,125 +1217,106 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32-NEXT: lui a1, 1044480
; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a1, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsrl.vi v29, v30, 8
-; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v26
-; LMULMAX1-RV32-NEXT: vor.vv v28, v29, v28
-; LMULMAX1-RV32-NEXT: vor.vv v31, v28, v27
+; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 8
+; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v26
+; LMULMAX1-RV32-NEXT: vor.vv v28, v30, v28
+; LMULMAX1-RV32-NEXT: vor.vv v30, v28, v27
; LMULMAX1-RV32-NEXT: addi a1, zero, 255
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1
; LMULMAX1-RV32-NEXT: vmerge.vim v27, v27, 0, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsll.vi v28, v30, 8
-; LMULMAX1-RV32-NEXT: vand.vv v29, v28, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v28, v29, 8
+; LMULMAX1-RV32-NEXT: vand.vv v31, v28, v27
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v28, a4
; LMULMAX1-RV32-NEXT: vmerge.vim v28, v28, 0, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vsll.vi v8, v30, 24
+; LMULMAX1-RV32-NEXT: vsll.vi v8, v29, 24
; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v28
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v29
-; LMULMAX1-RV32-NEXT: vsll.vx v9, v30, a3
+; LMULMAX1-RV32-NEXT: vor.vv v31, v8, v31
+; LMULMAX1-RV32-NEXT: vsll.vx v8, v29, a3
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v29, a5
-; LMULMAX1-RV32-NEXT: vmerge.vim v29, v29, 0, v0
+; LMULMAX1-RV32-NEXT: vmv.v.x v9, a5
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 0, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v29
-; LMULMAX1-RV32-NEXT: vsll.vx v30, v30, a2
-; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v9
-; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v8
-; LMULMAX1-RV32-NEXT: vor.vv v31, v30, v31
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: vsll.vx v29, v29, a2
+; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v8
+; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v31
+; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v30
+; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 4
; LMULMAX1-RV32-NEXT: lui a1, 61681
; LMULMAX1-RV32-NEXT: addi a1, a1, -241
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v30, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v8, v31, v30
-; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4
-; LMULMAX1-RV32-NEXT: lui a1, 986895
-; LMULMAX1-RV32-NEXT: addi a1, a1, 240
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v31, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 4
-; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v8
+; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v31
+; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v31
+; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 4
+; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 2
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v10, v31, v8
-; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2
-; LMULMAX1-RV32-NEXT: lui a1, 838861
-; LMULMAX1-RV32-NEXT: addi a1, a1, -820
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v11
-; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 2
-; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v10
+; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v8
+; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v8
+; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 2
+; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 1
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v12, v31, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v12, v12, v12
-; LMULMAX1-RV32-NEXT: lui a1, 699051
-; LMULMAX1-RV32-NEXT: addi a1, a1, -1366
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v13
-; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 1
-; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v12
-; LMULMAX1-RV32-NEXT: vsrl.vx v12, v25, a2
-; LMULMAX1-RV32-NEXT: vsrl.vx v14, v25, a3
-; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a4
-; LMULMAX1-RV32-NEXT: vor.vv v12, v14, v12
-; LMULMAX1-RV32-NEXT: vsrl.vi v14, v25, 24
-; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v25, 8
-; LMULMAX1-RV32-NEXT: vand.vv v26, v15, v26
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v14
-; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v12
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v25, 8
-; LMULMAX1-RV32-NEXT: vand.vv v27, v12, v27
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v25, 24
-; LMULMAX1-RV32-NEXT: vand.vv v28, v12, v28
+; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v10
+; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v10
+; LMULMAX1-RV32-NEXT: vadd.vv v29, v29, v29
+; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29
+; LMULMAX1-RV32-NEXT: vsrl.vx v30, v25, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v11, v25, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT: vor.vv v30, v11, v30
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v25, 24
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT: vsrl.vi v12, v25, 8
+; LMULMAX1-RV32-NEXT: vand.vv v26, v12, v26
+; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v11
+; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v30
+; LMULMAX1-RV32-NEXT: vsll.vi v30, v25, 8
+; LMULMAX1-RV32-NEXT: vand.vv v27, v30, v27
+; LMULMAX1-RV32-NEXT: vsll.vi v30, v25, 24
+; LMULMAX1-RV32-NEXT: vand.vv v28, v30, v28
; LMULMAX1-RV32-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV32-NEXT: vsll.vx v28, v25, a3
-; LMULMAX1-RV32-NEXT: vand.vv v28, v28, v29
+; LMULMAX1-RV32-NEXT: vand.vv v28, v28, v9
; LMULMAX1-RV32-NEXT: vsll.vx v25, v25, a2
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27
; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v30
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4
-; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v8
-; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2
-; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v11
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26
-; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v13
-; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v31
+; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v31
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v8
+; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1
+; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v10
+; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v10
+; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25
; LMULMAX1-RV32-NEXT: vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v31, (a6)
+; LMULMAX1-RV32-NEXT: vse64.v v29, (a6)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: addi sp, sp, -16
-; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16
-; LMULMAX1-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; LMULMAX1-RV64-NEXT: .cfi_offset s0, -8
; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; LMULMAX1-RV64-NEXT: addi a6, a0, 16
; LMULMAX1-RV64-NEXT: vle64.v v26, (a6)
@@ -1581,33 +1325,34 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, t0
; LMULMAX1-RV64-NEXT: addi t1, zero, 40
; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t1
-; LMULMAX1-RV64-NEXT: lui a1, 16
-; LMULMAX1-RV64-NEXT: addiw t4, a1, -256
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT: lui a4, 16
+; LMULMAX1-RV64-NEXT: addiw t2, a4, -256
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24
; LMULMAX1-RV64-NEXT: lui a7, 4080
; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
; LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8
-; LMULMAX1-RV64-NEXT: addi a3, zero, 255
-; LMULMAX1-RV64-NEXT: slli a1, a3, 24
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT: addi a1, zero, 255
+; LMULMAX1-RV64-NEXT: slli t4, a1, 24
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT: slli a5, a3, 32
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: slli a2, a1, 32
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a2
; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24
-; LMULMAX1-RV64-NEXT: slli a2, a3, 40
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT: slli a3, a1, 40
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a3
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, t0
; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT: slli a3, a3, 48
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3
+; LMULMAX1-RV64-NEXT: slli a1, a1, 48
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1
; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28
; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4
; LMULMAX1-RV64-NEXT: lui a4, 3855
; LMULMAX1-RV64-NEXT: addiw a4, a4, 241
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
@@ -1615,20 +1360,12 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
; LMULMAX1-RV64-NEXT: addi a4, a4, 241
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t2, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t2
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
-; LMULMAX1-RV64-NEXT: lui a4, 1044721
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, -241
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t3, a4, 240
+; LMULMAX1-RV64-NEXT: addi t3, a4, -241
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3
; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t3
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2
; LMULMAX1-RV64-NEXT: lui a4, 13107
; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
@@ -1636,81 +1373,62 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
; LMULMAX1-RV64-NEXT: addi a4, a4, 819
; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t5, a4, 819
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t5
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: lui a4, 1035469
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, -819
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi t6, a4, -820
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
-; LMULMAX1-RV64-NEXT: lui a4, 21845
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: slli a4, a4, 12
-; LMULMAX1-RV64-NEXT: addi a4, a4, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: lui s0, 1026731
-; LMULMAX1-RV64-NEXT: addiw s0, s0, -1365
-; LMULMAX1-RV64-NEXT: slli s0, s0, 12
-; LMULMAX1-RV64-NEXT: addi s0, s0, -1365
-; LMULMAX1-RV64-NEXT: slli s0, s0, 12
-; LMULMAX1-RV64-NEXT: addi s0, s0, -1365
-; LMULMAX1-RV64-NEXT: slli s0, s0, 12
-; LMULMAX1-RV64-NEXT: addi s0, s0, -1366
-; LMULMAX1-RV64-NEXT: vand.vx v26, v26, s0
-; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1
-; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT: addi a4, a4, 819
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4
+; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1
+; LMULMAX1-RV64-NEXT: lui a5, 21845
+; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT: slli a5, a5, 12
+; LMULMAX1-RV64-NEXT: addi a5, a5, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5
+; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26
+; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26
; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, t0
; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t1
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24
; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7
; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27
; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5
+; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a2
; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24
-; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2
+; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a3
; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28
; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, t0
; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1
; LMULMAX1-RV64-NEXT: vor.vv v25, v29, v25
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28
; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t2
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3
; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t3
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t5
-; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
-; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27
-; LMULMAX1-RV64-NEXT: vand.vx v25, v25, s0
-; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1
-; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4
+; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
+; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5
+; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5
+; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25
+; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25
; LMULMAX1-RV64-NEXT: vse64.v v25, (a0)
; LMULMAX1-RV64-NEXT: vse64.v v26, (a6)
-; LMULMAX1-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; LMULMAX1-RV64-NEXT: addi sp, sp, 16
; LMULMAX1-RV64-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %x
%b = load <4 x i64>, <4 x i64>* %y
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index b1c4cbead6c15..cfdbbce7f1f56 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -17,35 +17,35 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $3855, %edx # imm = 0xF0F
; X86-NEXT: shll $4, %edx
-; X86-NEXT: andl $61680, %eax # imm = 0xF0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $3855, %eax # imm = 0xF0F
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $13107, %edx # imm = 0x3333
-; X86-NEXT: andl $52428, %eax # imm = 0xCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $13107, %eax # imm = 0x3333
; X86-NEXT: leal (%eax,%edx,4), %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $21845, %edx # imm = 0x5555
-; X86-NEXT: andl $43690, %eax # imm = 0xAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $21845, %eax # imm = 0x5555
; X86-NEXT: leal (%eax,%edx,2), %eax
; X86-NEXT: rolw $8, %cx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $3855, %edx # imm = 0xF0F
; X86-NEXT: shll $4, %edx
-; X86-NEXT: andl $61680, %ecx # imm = 0xF0F0
; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $13107, %edx # imm = 0x3333
-; X86-NEXT: andl $52428, %ecx # imm = 0xCCCC
; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: andl $13107, %ecx # imm = 0x3333
; X86-NEXT: leal (%ecx,%edx,4), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $21845, %edx # imm = 0x5555
-; X86-NEXT: andl $43690, %ecx # imm = 0xAAAA
; X86-NEXT: shrl %ecx
+; X86-NEXT: andl $21845, %ecx # imm = 0x5555
; X86-NEXT: leal (%ecx,%edx,2), %edx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: # kill: def $dx killed $dx killed $edx
@@ -63,16 +63,18 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X64-NEXT: psrlw $4, %xmm0
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: psllw $2, %xmm1
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: psrlw $2, %xmm0
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrlw $2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X64-NEXT: pand %xmm2, %xmm1
+; X64-NEXT: pand %xmm2, %xmm0
+; X64-NEXT: psllw $2, %xmm0
; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; X64-NEXT: pand %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlw $1, %xmm1
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X64-NEXT: pand %xmm2, %xmm1
+; X64-NEXT: pand %xmm2, %xmm0
; X64-NEXT: paddb %xmm0, %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: retq
@@ -96,60 +98,60 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %edx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%edx,4), %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%edx,2), %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %edx
-; X86-NEXT: andl $-252645136, %ecx # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %ecx # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X86-NEXT: leal (%ecx,%edx,4), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: shrl %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT: leal (%ecx,%edx,2), %edx
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i64:
; X64: # %bb.0:
; X64-NEXT: bswapq %rdi
-; X64-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rdi, %rax
-; X64-NEXT: shlq $4, %rax
-; X64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; X64-NEXT: andq %rdi, %rcx
-; X64-NEXT: shrq $4, %rcx
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $4, %rax
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rcx, %rax
-; X64-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; X64-NEXT: andq %rcx, %rdx
-; X64-NEXT: shrq $2, %rdx
-; X64-NEXT: leaq (%rdx,%rax,4), %rax
-; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: andq %rcx, %rdi
+; X64-NEXT: shlq $4, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
-; X64-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: shrq %rdx
-; X64-NEXT: leaq (%rdx,%rcx,2), %rax
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: leaq (%rdi,%rcx,4), %rax
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: andq %rcx, %rdx
+; X64-NEXT: shrq %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: leaq (%rax,%rdx,2), %rax
; X64-NEXT: retq
;
; X86XOP-LABEL: test_bitreverse_i64:
@@ -173,18 +175,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: retl
;
@@ -195,18 +197,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: shll $4, %eax
-; X64-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; X64-NEXT: shrl $4, %edi
+; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X64-NEXT: orl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X64-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X64-NEXT: leal (%rax,%rcx,2), %eax
; X64-NEXT: retq
;
@@ -230,18 +232,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655680, %ecx # imm = 0x55555500
-; X86-NEXT: andl $-1431655936, %eax # imm = 0xAAAAAA00
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655680, %eax # imm = 0x55555500
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: retl
@@ -253,18 +255,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: shll $4, %eax
-; X64-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; X64-NEXT: shrl $4, %edi
+; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X64-NEXT: orl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $1431655680, %ecx # imm = 0x55555500
-; X64-NEXT: andl $-1431655936, %eax # imm = 0xAAAAAA00
; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655680, %eax # imm = 0x55555500
; X64-NEXT: leal (%rax,%rcx,2), %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: retq
@@ -290,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $61680, %eax # imm = 0xF0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $3855, %eax # imm = 0xF0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $13107, %ecx # imm = 0x3333
-; X86-NEXT: andl $52428, %eax # imm = 0xCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $13107, %eax # imm = 0x3333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $21845, %ecx # imm = 0x5555
-; X86-NEXT: andl $43690, %eax # imm = 0xAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $21845, %eax # imm = 0x5555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
@@ -313,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: shll $4, %eax
-; X64-NEXT: andl $61680, %edi # imm = 0xF0F0
; X64-NEXT: shrl $4, %edi
+; X64-NEXT: andl $3855, %edi # imm = 0xF0F
; X64-NEXT: orl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $13107, %eax # imm = 0x3333
-; X64-NEXT: andl $52428, %edi # imm = 0xCCCC
; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $13107, %edi # imm = 0x3333
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $21845, %ecx # imm = 0x5555
-; X64-NEXT: andl $43690, %eax # imm = 0xAAAA
; X64-NEXT: shrl %eax
+; X64-NEXT: andl $21845, %eax # imm = 0x5555
; X64-NEXT: leal (%rax,%rcx,2), %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -350,14 +352,14 @@ define i8 @test_bitreverse_i8(i8 %a) {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andb $51, %cl
; X86-NEXT: shlb $2, %cl
-; X86-NEXT: andb $-52, %al
; X86-NEXT: shrb $2, %al
+; X86-NEXT: andb $51, %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andb $85, %cl
; X86-NEXT: addb %cl, %cl
-; X86-NEXT: andb $-86, %al
; X86-NEXT: shrb %al
+; X86-NEXT: andb $85, %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: retl
;
@@ -368,14 +370,14 @@ define i8 @test_bitreverse_i8(i8 %a) {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
; X64-NEXT: shlb $2, %al
-; X64-NEXT: andb $-52, %dil
; X64-NEXT: shrb $2, %dil
+; X64-NEXT: andb $51, %dil
; X64-NEXT: orb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $85, %al
; X64-NEXT: addb %al, %al
-; X64-NEXT: andb $-86, %dil
; X64-NEXT: shrb %dil
+; X64-NEXT: andb $85, %dil
; X64-NEXT: addl %edi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -401,14 +403,14 @@ define i4 @test_bitreverse_i4(i4 %a) {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andb $51, %cl
; X86-NEXT: shlb $2, %cl
-; X86-NEXT: andb $-52, %al
; X86-NEXT: shrb $2, %al
+; X86-NEXT: andb $51, %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andb $80, %cl
; X86-NEXT: addb %cl, %cl
-; X86-NEXT: andb $-96, %al
; X86-NEXT: shrb %al
+; X86-NEXT: andb $80, %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: shrb $4, %al
; X86-NEXT: retl
@@ -420,14 +422,14 @@ define i4 @test_bitreverse_i4(i4 %a) {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
; X64-NEXT: shlb $2, %al
-; X64-NEXT: andb $-52, %dil
; X64-NEXT: shrb $2, %dil
+; X64-NEXT: andb $51, %dil
; X64-NEXT: orb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $80, %al
; X64-NEXT: addb %al, %al
-; X64-NEXT: andb $-96, %dil
; X64-NEXT: shrb %dil
+; X64-NEXT: andb $80, %dil
; X64-NEXT: addl %edi, %eax
; X64-NEXT: shrb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -621,107 +623,107 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: andl $252645135, %ebp # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ebp
-; X86-NEXT: andl $-252645136, %ebx # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %ebx
+; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F
; X86-NEXT: orl %ebp, %ebx
; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: andl $858993459, %ebp # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %ebx # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %ebx
+; X86-NEXT: andl $858993459, %ebx # imm = 0x33333333
; X86-NEXT: leal (%ebx,%ebp,4), %ebx
; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000
-; X86-NEXT: andl $-1431699456, %ebx # imm = 0xAAAA0000
; X86-NEXT: shrl %ebx
+; X86-NEXT: andl $1431633920, %ebx # imm = 0x55550000
; X86-NEXT: leal (%ebx,%ebp,2), %ebx
; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-NEXT: bswapl %edi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ebx
-; X86-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %edi
+; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %edi
+; X86-NEXT: andl $858993459, %edi # imm = 0x33333333
; X86-NEXT: leal (%edi,%ebx,4), %edi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %edi # imm = 0xAAAAAAAA
; X86-NEXT: shrl %edi
+; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555
; X86-NEXT: leal (%edi,%ebx,2), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bswapl %esi
; X86-NEXT: movl %esi, %edi
; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %edi
-; X86-NEXT: andl $-252645136, %esi # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %esi
+; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F
; X86-NEXT: orl %edi, %esi
; X86-NEXT: movl %esi, %edi
; X86-NEXT: andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %esi # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %esi
+; X86-NEXT: andl $858993459, %esi # imm = 0x33333333
; X86-NEXT: leal (%esi,%edi,4), %esi
; X86-NEXT: movl %esi, %edi
; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %esi # imm = 0xAAAAAAAA
; X86-NEXT: shrl %esi
+; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555
; X86-NEXT: leal (%esi,%edi,2), %ebx
; X86-NEXT: bswapl %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %esi
-; X86-NEXT: andl $-252645136, %edx # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %edx
+; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: orl %esi, %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %edx # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %edx
+; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
; X86-NEXT: leal (%edx,%esi,4), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %edx # imm = 0xAAAAAAAA
; X86-NEXT: shrl %edx
+; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: leal (%edx,%esi,2), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bswapl %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %edx
-; X86-NEXT: andl $-252645136, %ecx # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %ecx # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X86-NEXT: leal (%ecx,%edx,4), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: shrl %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT: leal (%ecx,%edx,2), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bswapl %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -729,18 +731,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -748,18 +750,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -767,18 +769,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -786,18 +788,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -805,18 +807,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -824,18 +826,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -843,18 +845,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -862,18 +864,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -881,18 +883,18 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -900,36 +902,36 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bswapl %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT: shll $4, %ecx
-; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0
; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC
; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT: leal (%eax,%ecx,2), %edx
; X86-NEXT: movl (%esp), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1018,194 +1020,186 @@ define i528 @large_promotion(i528 %A) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; X64-NEXT: bswapq %rbx
-; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: movq %rbx, %r10
-; X64-NEXT: andq %r13, %r10
-; X64-NEXT: shlq $4, %r10
-; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0
-; X64-NEXT: andq %rax, %rbx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: bswapq %rdi
+; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: shrq $4, %rbx
-; X64-NEXT: orq %r10, %rbx
+; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %r13, %rbx
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: shlq $4, %rdi
+; X64-NEXT: orq %rbx, %rdi
; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; X64-NEXT: movq %rbx, %r10
-; X64-NEXT: andq %r11, %r10
-; X64-NEXT: movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC
-; X64-NEXT: andq %r14, %rbx
-; X64-NEXT: shrq $2, %rbx
-; X64-NEXT: leaq (%rbx,%r10,4), %r10
-; X64-NEXT: movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,4), %rdi
+; X64-NEXT: movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000
+; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: andq %r10, %rbx
-; X64-NEXT: movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000
-; X64-NEXT: andq %r10, %rdi
; X64-NEXT: shrq %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: andq %r10, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %r10
; X64-NEXT: bswapq %rbp
; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: shrq $4, %rdi
; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: shlq $4, %rdi
-; X64-NEXT: andq %rax, %rbp
-; X64-NEXT: shrq $4, %rbp
+; X64-NEXT: andq %r13, %rbp
+; X64-NEXT: shlq $4, %rbp
; X64-NEXT: orq %rdi, %rbp
; X64-NEXT: movq %rbp, %rdi
; X64-NEXT: andq %r11, %rdi
-; X64-NEXT: andq %r14, %rbp
-; X64-NEXT: shrq $2, %rbp
-; X64-NEXT: leaq (%rbp,%rdi,4), %rbp
-; X64-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
-; X64-NEXT: movq %rbp, %r10
-; X64-NEXT: andq %rbx, %r10
-; X64-NEXT: movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT: andq %rdi, %rbp
-; X64-NEXT: shrq %rbp
-; X64-NEXT: leaq (%rbp,%r10,2), %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; X64-NEXT: bswapq %rbp
-; X64-NEXT: movq %rbp, %r10
-; X64-NEXT: andq %r13, %r10
-; X64-NEXT: shlq $4, %r10
-; X64-NEXT: andq %rax, %rbp
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: shrq $4, %rbp
-; X64-NEXT: orq %r10, %rbp
-; X64-NEXT: movq %rbp, %r10
-; X64-NEXT: andq %r11, %r10
-; X64-NEXT: andq %r14, %rbp
; X64-NEXT: shrq $2, %rbp
-; X64-NEXT: leaq (%rbp,%r10,4), %rbp
-; X64-NEXT: movq %rbp, %r10
-; X64-NEXT: andq %rbx, %r10
-; X64-NEXT: andq %rdi, %rbp
-; X64-NEXT: shrq %rbp
-; X64-NEXT: leaq (%rbp,%r10,2), %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT: bswapq %r10
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: andq %r13, %rax
-; X64-NEXT: shlq $4, %rax
+; X64-NEXT: andq %r11, %rbp
+; X64-NEXT: leaq (%rbp,%rdi,4), %rdi
+; X64-NEXT: movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rdi
+; X64-NEXT: andq %rbp, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %r14
+; X64-NEXT: shrdq $48, %r14, %r10
+; X64-NEXT: bswapq %r15
; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: andq %r15, %r10
-; X64-NEXT: shrq $4, %r10
-; X64-NEXT: orq %rax, %r10
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: andq %r11, %rax
-; X64-NEXT: andq %r14, %r10
-; X64-NEXT: shrq $2, %r10
-; X64-NEXT: leaq (%r10,%rax,4), %rax
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: andq %rbx, %r10
-; X64-NEXT: movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA
-; X64-NEXT: andq %r15, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: leaq (%rax,%r10,2), %r10
+; X64-NEXT: shrq $4, %rdi
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: andq %r13, %r15
+; X64-NEXT: shlq $4, %r15
+; X64-NEXT: orq %rdi, %r15
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: shrq $2, %r15
+; X64-NEXT: andq %r11, %r15
+; X64-NEXT: leaq (%r15,%rdi,4), %rdi
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rdi
+; X64-NEXT: andq %rbp, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %r15
+; X64-NEXT: shrdq $48, %r15, %r14
+; X64-NEXT: bswapq %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: shrq $4, %rdi
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: andq %r13, %r12
+; X64-NEXT: shlq $4, %r12
+; X64-NEXT: orq %rdi, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: shrq $2, %r12
+; X64-NEXT: andq %r11, %r12
+; X64-NEXT: leaq (%r12,%rdi,4), %rdi
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rdi
+; X64-NEXT: andq %rbp, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %r12
+; X64-NEXT: shrdq $48, %r12, %r15
; X64-NEXT: bswapq %r9
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: andq %r13, %rax
-; X64-NEXT: shlq $4, %rax
-; X64-NEXT: andq %rdi, %r9
-; X64-NEXT: shrq $4, %r9
-; X64-NEXT: orq %rax, %r9
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: andq %r11, %rax
-; X64-NEXT: andq %r14, %r9
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: shrq $4, %rdi
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: andq %r13, %r9
+; X64-NEXT: shlq $4, %r9
+; X64-NEXT: orq %rdi, %r9
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: andq %r11, %rdi
; X64-NEXT: shrq $2, %r9
-; X64-NEXT: leaq (%r9,%rax,4), %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: andq %rbx, %r9
-; X64-NEXT: andq %r15, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: leaq (%rax,%r9,2), %r9
+; X64-NEXT: andq %r11, %r9
+; X64-NEXT: leaq (%r9,%rdi,4), %rdi
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rdi
+; X64-NEXT: andq %rbp, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %r9
+; X64-NEXT: shrdq $48, %r9, %r12
; X64-NEXT: bswapq %r8
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: andq %r13, %rax
-; X64-NEXT: shlq $4, %rax
-; X64-NEXT: andq %rdi, %r8
-; X64-NEXT: shrq $4, %r8
-; X64-NEXT: orq %rax, %r8
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: andq %r11, %rax
-; X64-NEXT: andq %r14, %r8
+; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: shrq $4, %rdi
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: andq %r13, %r8
+; X64-NEXT: shlq $4, %r8
+; X64-NEXT: orq %rdi, %r8
+; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: andq %r11, %rdi
; X64-NEXT: shrq $2, %r8
-; X64-NEXT: leaq (%r8,%rax,4), %rax
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: andq %rbx, %r8
-; X64-NEXT: andq %r15, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: leaq (%rax,%r8,2), %r8
+; X64-NEXT: andq %r11, %r8
+; X64-NEXT: leaq (%r8,%rdi,4), %rdi
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rdi
+; X64-NEXT: andq %rbp, %rdi
+; X64-NEXT: leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT: shrdq $48, %rdi, %r9
; X64-NEXT: bswapq %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: andq %r13, %rax
-; X64-NEXT: shlq $4, %rax
-; X64-NEXT: andq %rdi, %rcx
-; X64-NEXT: shrq $4, %rcx
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: andq %r11, %rax
-; X64-NEXT: andq %r14, %rcx
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: shrq $4, %rbx
+; X64-NEXT: andq %r13, %rbx
+; X64-NEXT: andq %r13, %rcx
+; X64-NEXT: shlq $4, %rcx
+; X64-NEXT: orq %rbx, %rcx
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: andq %r11, %rbx
; X64-NEXT: shrq $2, %rcx
-; X64-NEXT: leaq (%rcx,%rax,4), %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: andq %rbx, %rcx
-; X64-NEXT: andq %r15, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: leaq (%rax,%rcx,2), %rcx
+; X64-NEXT: andq %r11, %rcx
+; X64-NEXT: leaq (%rcx,%rbx,4), %rcx
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rcx
+; X64-NEXT: andq %rbp, %rcx
+; X64-NEXT: leaq (%rcx,%rbx,2), %rcx
+; X64-NEXT: shrdq $48, %rcx, %rdi
; X64-NEXT: bswapq %rdx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq %r13, %rax
-; X64-NEXT: shlq $4, %rax
-; X64-NEXT: andq %rdi, %rdx
-; X64-NEXT: shrq $4, %rdx
-; X64-NEXT: orq %rax, %rdx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq %r11, %rax
-; X64-NEXT: andq %r14, %rdx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: shrq $4, %rbx
+; X64-NEXT: andq %r13, %rbx
+; X64-NEXT: andq %r13, %rdx
+; X64-NEXT: shlq $4, %rdx
+; X64-NEXT: orq %rbx, %rdx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: andq %r11, %rbx
; X64-NEXT: shrq $2, %rdx
-; X64-NEXT: leaq (%rdx,%rax,4), %rax
-; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: andq %rbx, %rdx
-; X64-NEXT: andq %r15, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: leaq (%rax,%rdx,2), %rax
-; X64-NEXT: bswapq %rsi
-; X64-NEXT: andq %rsi, %r13
-; X64-NEXT: andq %rdi, %rsi
-; X64-NEXT: shlq $4, %r13
-; X64-NEXT: shrq $4, %rsi
-; X64-NEXT: orq %r13, %rsi
-; X64-NEXT: andq %rsi, %r11
-; X64-NEXT: andq %r14, %rsi
-; X64-NEXT: shrq $2, %rsi
-; X64-NEXT: leaq (%rsi,%r11,4), %rdx
-; X64-NEXT: andq %rdx, %rbx
-; X64-NEXT: andq %r15, %rdx
+; X64-NEXT: andq %r11, %rdx
+; X64-NEXT: leaq (%rdx,%rbx,4), %rdx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: andq %rbp, %rbx
; X64-NEXT: shrq %rdx
+; X64-NEXT: andq %rbp, %rdx
; X64-NEXT: leaq (%rdx,%rbx,2), %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: shrdq $48, %rdi, %rsi
-; X64-NEXT: shrdq $48, %rbp, %rdi
-; X64-NEXT: shrdq $48, %r10, %rbp
-; X64-NEXT: shrdq $48, %r9, %r10
-; X64-NEXT: shrdq $48, %r8, %r9
-; X64-NEXT: shrdq $48, %rcx, %r8
-; X64-NEXT: shrdq $48, %rax, %rcx
-; X64-NEXT: shrdq $48, %rdx, %rax
-; X64-NEXT: movq %rax, 56(%r12)
-; X64-NEXT: movq %rcx, 48(%r12)
-; X64-NEXT: movq %r8, 40(%r12)
-; X64-NEXT: movq %r9, 32(%r12)
-; X64-NEXT: movq %r10, 24(%r12)
-; X64-NEXT: movq %rbp, 16(%r12)
-; X64-NEXT: movq %rdi, 8(%r12)
-; X64-NEXT: movq %rsi, (%r12)
-; X64-NEXT: shrq $48, %rdx
-; X64-NEXT: movw %dx, 64(%r12)
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: shrdq $48, %rdx, %rcx
+; X64-NEXT: bswapq %rsi
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: shrq $4, %rbx
+; X64-NEXT: andq %r13, %rbx
+; X64-NEXT: andq %r13, %rsi
+; X64-NEXT: shlq $4, %rsi
+; X64-NEXT: orq %rbx, %rsi
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: shrq $2, %rsi
+; X64-NEXT: andq %r11, %rsi
+; X64-NEXT: leaq (%rsi,%rbx,4), %rsi
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: shrq %rsi
+; X64-NEXT: andq %rbp, %rsi
+; X64-NEXT: leaq (%rsi,%rbx,2), %rsi
+; X64-NEXT: shrdq $48, %rsi, %rdx
+; X64-NEXT: shrq $48, %rsi
+; X64-NEXT: movq %rdx, 56(%rax)
+; X64-NEXT: movq %rcx, 48(%rax)
+; X64-NEXT: movq %rdi, 40(%rax)
+; X64-NEXT: movq %r9, 32(%rax)
+; X64-NEXT: movq %r12, 24(%rax)
+; X64-NEXT: movq %r15, 16(%rax)
+; X64-NEXT: movq %r14, 8(%rax)
+; X64-NEXT: movq %r10, (%rax)
+; X64-NEXT: movw %si, 64(%rax)
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
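
The updated scalar checks above use one mask constant per swap stage, applied both to the right-shifted value and to the value about to be shifted left (the same immediate, e.g. 252645135, now appears twice in each stage instead of a positive/negative pair). A minimal C++ reference of that shape, assuming a compiler that provides __builtin_bswap32; the helper name is illustrative only:

#include <cstdint>

// Reference 32-bit reversal mirroring the checked sequence: byte swap, then
// three swap stages, each reusing a single mask for both shifted halves.
uint32_t bitreverse32(uint32_t v) {
  v = __builtin_bswap32(v);                                // bswapl
  v = ((v >> 4) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4); // swap nibbles
  v = ((v >> 2) & 0x33333333u) | ((v & 0x33333333u) << 2); // swap bit pairs
  v = ((v >> 1) & 0x55555555u) | ((v & 0x55555555u) << 1); // swap single bits
  return v;
}
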
diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 8c41f533fd6b2..4a50f7c879adb 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -55,16 +55,18 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
; X86-NEXT: psrlw $4, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: por %xmm1, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X86-NEXT: pand %xmm0, %xmm1
-; X86-NEXT: psllw $2, %xmm1
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: psrlw $2, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlw $2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X86-NEXT: pand %xmm2, %xmm1
+; X86-NEXT: pand %xmm2, %xmm0
+; X86-NEXT: psllw $2, %xmm0
; X86-NEXT: por %xmm1, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; X86-NEXT: pand %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlw $1, %xmm1
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X86-NEXT: pand %xmm2, %xmm1
+; X86-NEXT: pand %xmm2, %xmm0
; X86-NEXT: paddb %xmm0, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
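
The SSE2 checks above follow the same shape per byte lane: shift with psrlw/psllw, then reuse one splatted byte mask (51 = 0x33, 85 = 0x55) on both operands before OR-ing them back together. A sketch of one such stage using the standard SSE2 intrinsics; the helper name is illustrative only:

#include <emmintrin.h>

// One lane-wise swap stage: ((v >> 2) & 0x33) | ((v & 0x33) << 2) per byte.
// SSE2 only has 16-bit shifts, so the mask also clears bits that psrlw
// drags across byte boundaries.
__m128i swap_bit_pairs(__m128i v) {
  const __m128i mask = _mm_set1_epi8(0x33);
  __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 2), mask); // psrlw $2 + pand
  __m128i lo = _mm_slli_epi16(_mm_and_si128(v, mask), 2); // pand + psllw $2
  return _mm_or_si128(hi, lo);                            // por
}
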
diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll
index 5bdf7872d61a0..2cbced7053e87 100644
--- a/llvm/test/CodeGen/X86/pr43820.ll
+++ b/llvm/test/CodeGen/X86/pr43820.ll
@@ -10,363 +10,362 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; CHECK-NEXT: bswapq %rbp
+; CHECK-NEXT: movq %rbp, %r11
+; CHECK-NEXT: shrq $4, %r11
+; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT: andq %rsi, %r11
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: shlq $4, %rbp
+; CHECK-NEXT: orq %r11, %rbp
+; CHECK-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; CHECK-NEXT: movq %rbp, %r12
+; CHECK-NEXT: andq %rdi, %r12
+; CHECK-NEXT: shrq $2, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
+; CHECK-NEXT: leaq (%rbp,%r12,4), %rbp
+; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000
+; CHECK-NEXT: movq %rbp, %r13
+; CHECK-NEXT: andq %r12, %r13
+; CHECK-NEXT: shrq %rbp
+; CHECK-NEXT: andq %r12, %rbp
+; CHECK-NEXT: leaq (%rbp,%r13,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rbx
-; CHECK-NEXT: movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
; CHECK-NEXT: movq %rbx, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0
-; CHECK-NEXT: andq %r11, %rbx
-; CHECK-NEXT: movq %r11, %rax
-; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: shrq $4, %rbp
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: andq %rsi, %rbx
+; CHECK-NEXT: shlq $4, %rbx
; CHECK-NEXT: orq %rbp, %rbx
-; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; CHECK-NEXT: movq %rbx, %r14
-; CHECK-NEXT: andq %r11, %r14
-; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC
-; CHECK-NEXT: andq %rbp, %rbx
-; CHECK-NEXT: movq %rbp, %r15
+; CHECK-NEXT: movq %rbx, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shrq $2, %rbx
-; CHECK-NEXT: leaq (%rbx,%r14,4), %r14
-; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000
-; CHECK-NEXT: andq %r14, %rbx
-; CHECK-NEXT: movabsq $-6148914691247702016, %rbp # imm = 0xAAAAAAAAAA000000
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%rbx,2), %rbx
-; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %rbx
; CHECK-NEXT: andq %rdi, %rbx
-; CHECK-NEXT: shlq $4, %rbx
-; CHECK-NEXT: andq %rax, %r10
-; CHECK-NEXT: shrq $4, %r10
-; CHECK-NEXT: orq %rbx, %r10
-; CHECK-NEXT: movq %r10, %rbx
-; CHECK-NEXT: andq %r11, %rbx
-; CHECK-NEXT: andq %r15, %r10
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: leaq (%r10,%rbx,4), %rbp
+; CHECK-NEXT: leaq (%rbx,%rbp,4), %rbp
; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: movabsq $-6148914691236517206, %r13 # imm = 0xAAAAAAAAAAAAAAAA
-; CHECK-NEXT: andq %r13, %rbp
+; CHECK-NEXT: movq %rbp, %r12
+; CHECK-NEXT: andq %rbx, %r12
; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %rax, %rbp
+; CHECK-NEXT: andq %rbx, %rbp
+; CHECK-NEXT: leaq (%rbp,%r12,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: bswapq %r15
+; CHECK-NEXT: movq %r15, %rbp
; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: andq %rsi, %r15
+; CHECK-NEXT: shlq $4, %r15
+; CHECK-NEXT: orq %rbp, %r15
+; CHECK-NEXT: movq %r15, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
+; CHECK-NEXT: shrq $2, %r15
+; CHECK-NEXT: andq %rdi, %r15
+; CHECK-NEXT: leaq (%r15,%rbp,4), %rbp
+; CHECK-NEXT: movq %rbp, %r15
+; CHECK-NEXT: andq %rbx, %r15
; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %rax, %rbp
+; CHECK-NEXT: andq %rbx, %rbp
+; CHECK-NEXT: leaq (%rbp,%r15,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: bswapq %r14
+; CHECK-NEXT: movq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: andq %rsi, %r14
+; CHECK-NEXT: shlq $4, %r14
+; CHECK-NEXT: orq %rbp, %r14
+; CHECK-NEXT: movq %r14, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
+; CHECK-NEXT: shrq $2, %r14
+; CHECK-NEXT: andq %rdi, %r14
+; CHECK-NEXT: leaq (%r14,%rbp,4), %rbp
+; CHECK-NEXT: movq %rbp, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %rax, %rbp
+; CHECK-NEXT: andq %rbx, %rbp
+; CHECK-NEXT: leaq (%rbp,%r14,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %rbp
; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %rax, %rbp
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
+; CHECK-NEXT: orq %rbp, %r10
+; CHECK-NEXT: movq %r10, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: leaq (%r10,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
+; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
+; CHECK-NEXT: shrq $4, %r10
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: andq %rsi, %rbp
+; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
+; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: andq %rbx, %rbp
+; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %rsi, %r14
+; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
+; CHECK-NEXT: orq %r14, %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rdi, %r14
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: movq %rdi, %rbp
+; CHECK-NEXT: leaq (%r10,%r14,4), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rbx, %r14
+; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
+; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %rsi, %r14
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: shlq $4, %r10
+; CHECK-NEXT: orq %r14, %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rdi, %r14
+; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: leaq (%r10,%r14,4), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rbx, %r14
+; CHECK-NEXT: shrq %r10
+; CHECK-NEXT: andq %rbx, %r10
+; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %rsi, %r14
+; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
+; CHECK-NEXT: orq %r14, %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rdi, %r14
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: leaq (%r10,%r14,4), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rbx, %r14
+; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
+; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %rsi, %r14
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: shlq $4, %r10
+; CHECK-NEXT: orq %r14, %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rdi, %r14
+; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: leaq (%r10,%r14,4), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: andq %rbx, %r14
+; CHECK-NEXT: shrq %r10
+; CHECK-NEXT: andq %rbx, %r10
+; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: andq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: andq %r15, %rbp
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
+; CHECK-NEXT: orq %rax, %r10
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: shrq $2, %r10
+; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: leaq (%r10,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %r10
; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%r10,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r9
-; CHECK-NEXT: movq %r9, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: andq %r14, %r9
-; CHECK-NEXT: shrq $4, %r9
-; CHECK-NEXT: orq %rbp, %r9
-; CHECK-NEXT: movq %r9, %rbp
-; CHECK-NEXT: andq %r11, %rbp
-; CHECK-NEXT: andq %r15, %r9
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rsi, %r9
+; CHECK-NEXT: shlq $4, %r9
+; CHECK-NEXT: orq %rax, %r9
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %r9
-; CHECK-NEXT: leaq (%r9,%rbp,4), %rbp
-; CHECK-NEXT: movq %rbp, %r9
+; CHECK-NEXT: andq %rdi, %r9
+; CHECK-NEXT: leaq (%r9,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %r9
; CHECK-NEXT: andq %rbx, %r9
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r9,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%r9,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r8
-; CHECK-NEXT: movq %r8, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: andq %r14, %r8
-; CHECK-NEXT: shrq $4, %r8
-; CHECK-NEXT: orq %rbp, %r8
-; CHECK-NEXT: movq %r8, %rbp
-; CHECK-NEXT: andq %r11, %rbp
-; CHECK-NEXT: andq %r15, %r8
-; CHECK-NEXT: movq %r15, %r9
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rsi, %r8
+; CHECK-NEXT: shlq $4, %r8
+; CHECK-NEXT: orq %rax, %r8
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %r8
-; CHECK-NEXT: leaq (%r8,%rbp,4), %rbp
-; CHECK-NEXT: movq %rbp, %r8
+; CHECK-NEXT: andq %rdi, %r8
+; CHECK-NEXT: leaq (%r8,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %r8
; CHECK-NEXT: andq %rbx, %r8
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: leaq (%rbp,%r8,2), %rbp
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%r8,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: movq %rcx, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: andq %r14, %rcx
-; CHECK-NEXT: shrq $4, %rcx
-; CHECK-NEXT: orq %rbp, %rcx
-; CHECK-NEXT: movq %rcx, %rbp
-; CHECK-NEXT: andq %r11, %rbp
-; CHECK-NEXT: andq %r15, %rcx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: shlq $4, %rcx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %rcx
-; CHECK-NEXT: leaq (%rcx,%rbp,4), %rcx
-; CHECK-NEXT: movq %rcx, %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: andq %r13, %rcx
-; CHECK-NEXT: shrq %rcx
-; CHECK-NEXT: leaq (%rcx,%rbp,2), %r15
+; CHECK-NEXT: andq %rdi, %rcx
+; CHECK-NEXT: leaq (%rcx,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: andq %rbx, %rcx
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%rcx,2), %r12
; CHECK-NEXT: bswapq %rdx
-; CHECK-NEXT: movq %rdx, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: andq %r14, %rdx
-; CHECK-NEXT: shrq $4, %rdx
-; CHECK-NEXT: orq %rbp, %rdx
-; CHECK-NEXT: movq %rdx, %rbp
-; CHECK-NEXT: andq %r11, %rbp
-; CHECK-NEXT: andq %r9, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rsi, %rdx
+; CHECK-NEXT: shlq $4, %rdx
+; CHECK-NEXT: orq %rax, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %rdx
-; CHECK-NEXT: leaq (%rdx,%rbp,4), %rdx
-; CHECK-NEXT: movq %rdx, %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: andq %r13, %rdx
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (%rdx,%rbp,2), %rdx
-; CHECK-NEXT: bswapq %rsi
-; CHECK-NEXT: andq %rsi, %rdi
-; CHECK-NEXT: andq %r14, %rsi
-; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: shrq $4, %rsi
-; CHECK-NEXT: orq %rdi, %rsi
-; CHECK-NEXT: andq %rsi, %r11
-; CHECK-NEXT: andq %r9, %rsi
-; CHECK-NEXT: shrq $2, %rsi
-; CHECK-NEXT: leaq (%rsi,%r11,4), %rsi
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %r13, %rsi
-; CHECK-NEXT: shrq %rsi
-; CHECK-NEXT: leaq (%rsi,%rbx,2), %r13
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: leaq (%rdx,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %rdx
+; CHECK-NEXT: andq %rbx, %rdx
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rax, %r11
+; CHECK-NEXT: bswapq %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $4, %rcx
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: shlq $4, %rax
+; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: andq %rbp, %rcx
+; CHECK-NEXT: shrq $2, %rax
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: leaq (%rax,%rcx,4), %rax
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: andq %rbx, %rsi
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %rax, %rdx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rcx, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rbp, %rcx
; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r12, %rbp
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %r13, %rbp
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %r15, %r13
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r14, %r12
+; CHECK-NEXT: shrdq $24, %r14, %r15
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rbx, %r14
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %r11, %rbx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r10, %rbx
+; CHECK-NEXT: shrdq $24, %r10, %r11
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r9, %r10
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r8, %r9
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rdi, %r8
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rsi, %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rax, %rsi
-; CHECK-NEXT: shrdq $24, %r15, %rax
+; CHECK-NEXT: shrdq $24, %rax, %r8
+; CHECK-NEXT: shrdq $24, %r12, %rax
; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrdq $24, %rdx, %r15
-; CHECK-NEXT: shrdq $24, %r13, %rdx
+; CHECK-NEXT: shrdq $24, %rdi, %r12
+; CHECK-NEXT: shrdq $24, %rsi, %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rdx, 112(%rax)
-; CHECK-NEXT: movq %r15, 104(%rax)
+; CHECK-NEXT: movq %rdi, 112(%rax)
+; CHECK-NEXT: movq %r12, 104(%rax)
; CHECK-NEXT: movq %rcx, 96(%rax)
-; CHECK-NEXT: movq %rsi, 88(%rax)
-; CHECK-NEXT: movq %rdi, 80(%rax)
-; CHECK-NEXT: movq %r8, 72(%rax)
-; CHECK-NEXT: movq %r9, 64(%rax)
-; CHECK-NEXT: movq %r10, 56(%rax)
-; CHECK-NEXT: movq %rbx, 48(%rax)
-; CHECK-NEXT: movq %r14, 40(%rax)
-; CHECK-NEXT: movq %r12, 32(%rax)
+; CHECK-NEXT: movq %r8, 88(%rax)
+; CHECK-NEXT: movq %r9, 80(%rax)
+; CHECK-NEXT: movq %r10, 72(%rax)
+; CHECK-NEXT: movq %r11, 64(%rax)
+; CHECK-NEXT: movq %rbx, 56(%rax)
+; CHECK-NEXT: movq %r14, 48(%rax)
+; CHECK-NEXT: movq %r15, 40(%rax)
+; CHECK-NEXT: movq %r13, 32(%rax)
; CHECK-NEXT: movq %rbp, 24(%rax)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: movq %rcx, 8(%rax)
-; CHECK-NEXT: movq %r11, (%rax)
-; CHECK-NEXT: movq %r13, %rcx
-; CHECK-NEXT: shrq $56, %r13
-; CHECK-NEXT: movb %r13b, 124(%rax)
+; CHECK-NEXT: movq %rdx, (%rax)
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rsi
+; CHECK-NEXT: movb %sil, 124(%rax)
; CHECK-NEXT: shrq $24, %rcx
; CHECK-NEXT: movl %ecx, 120(%rax)
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 651418d271be5..3555312b18a1a 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -24,14 +24,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
-; SSE-NEXT: andb $-52, %dil
; SSE-NEXT: shrb $2, %dil
+; SSE-NEXT: andb $51, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $85, %al
; SSE-NEXT: addb %al, %al
-; SSE-NEXT: andb $-86, %dil
; SSE-NEXT: shrb %dil
+; SSE-NEXT: andb $85, %dil
; SSE-NEXT: addl %edi, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
@@ -43,14 +43,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
-; AVX-NEXT: andb $-52, %dil
; AVX-NEXT: shrb $2, %dil
+; AVX-NEXT: andb $51, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $85, %al
; AVX-NEXT: addb %al, %al
-; AVX-NEXT: andb $-86, %dil
; AVX-NEXT: shrb %dil
+; AVX-NEXT: andb $85, %dil
; AVX-NEXT: addl %edi, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
@@ -70,14 +70,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $51, %al
; GFNISSE-NEXT: shlb $2, %al
-; GFNISSE-NEXT: andb $-52, %dil
; GFNISSE-NEXT: shrb $2, %dil
+; GFNISSE-NEXT: andb $51, %dil
; GFNISSE-NEXT: orb %al, %dil
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $85, %al
; GFNISSE-NEXT: addb %al, %al
-; GFNISSE-NEXT: andb $-86, %dil
; GFNISSE-NEXT: shrb %dil
+; GFNISSE-NEXT: andb $85, %dil
; GFNISSE-NEXT: addl %edi, %eax
; GFNISSE-NEXT: # kill: def $al killed $al killed $eax
; GFNISSE-NEXT: retq
@@ -89,14 +89,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $51, %al
; GFNIAVX-NEXT: shlb $2, %al
-; GFNIAVX-NEXT: andb $-52, %dil
; GFNIAVX-NEXT: shrb $2, %dil
+; GFNIAVX-NEXT: andb $51, %dil
; GFNIAVX-NEXT: orb %al, %dil
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $85, %al
; GFNIAVX-NEXT: addb %al, %al
-; GFNIAVX-NEXT: andb $-86, %dil
; GFNIAVX-NEXT: shrb %dil
+; GFNIAVX-NEXT: andb $85, %dil
; GFNIAVX-NEXT: addl %edi, %eax
; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT: retq
@@ -108,14 +108,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andb $51, %al
; GFNIAVX2-NEXT: shlb $2, %al
-; GFNIAVX2-NEXT: andb $-52, %dil
; GFNIAVX2-NEXT: shrb $2, %dil
+; GFNIAVX2-NEXT: andb $51, %dil
; GFNIAVX2-NEXT: orb %al, %dil
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andb $85, %al
; GFNIAVX2-NEXT: addb %al, %al
-; GFNIAVX2-NEXT: andb $-86, %dil
; GFNIAVX2-NEXT: shrb %dil
+; GFNIAVX2-NEXT: andb $85, %dil
; GFNIAVX2-NEXT: addl %edi, %eax
; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX2-NEXT: retq
@@ -127,14 +127,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andb $51, %al
; GFNIAVX512F-NEXT: shlb $2, %al
-; GFNIAVX512F-NEXT: andb $-52, %dil
; GFNIAVX512F-NEXT: shrb $2, %dil
+; GFNIAVX512F-NEXT: andb $51, %dil
; GFNIAVX512F-NEXT: orb %al, %dil
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andb $85, %al
; GFNIAVX512F-NEXT: addb %al, %al
-; GFNIAVX512F-NEXT: andb $-86, %dil
; GFNIAVX512F-NEXT: shrb %dil
+; GFNIAVX512F-NEXT: andb $85, %dil
; GFNIAVX512F-NEXT: addl %edi, %eax
; GFNIAVX512F-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX512F-NEXT: retq
@@ -146,14 +146,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andb $51, %al
; GFNIAVX512BW-NEXT: shlb $2, %al
-; GFNIAVX512BW-NEXT: andb $-52, %dil
; GFNIAVX512BW-NEXT: shrb $2, %dil
+; GFNIAVX512BW-NEXT: andb $51, %dil
; GFNIAVX512BW-NEXT: orb %al, %dil
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andb $85, %al
; GFNIAVX512BW-NEXT: addb %al, %al
-; GFNIAVX512BW-NEXT: andb $-86, %dil
; GFNIAVX512BW-NEXT: shrb %dil
+; GFNIAVX512BW-NEXT: andb $85, %dil
; GFNIAVX512BW-NEXT: addl %edi, %eax
; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX512BW-NEXT: retq
@@ -169,18 +169,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
-; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT: shrl $4, %edi
+; SSE-NEXT: andl $3855, %edi # imm = 0xF0F
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
-; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT: shrl $2, %edi
+; SSE-NEXT: andl $13107, %edi # imm = 0x3333
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
-; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
+; SSE-NEXT: andl $21845, %eax # imm = 0x5555
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
@@ -192,18 +192,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
-; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT: shrl $4, %edi
+; AVX-NEXT: andl $3855, %edi # imm = 0xF0F
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
-; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT: shrl $2, %edi
+; AVX-NEXT: andl $13107, %edi # imm = 0x3333
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
-; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
+; AVX-NEXT: andl $21845, %eax # imm = 0x5555
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
@@ -223,18 +223,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT: shll $4, %eax
-; GFNISSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNISSE-NEXT: shrl $4, %edi
+; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333
-; GFNISSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNISSE-NEXT: shrl $2, %edi
+; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555
-; GFNISSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNISSE-NEXT: shrl %eax
+; GFNISSE-NEXT: andl $21845, %eax # imm = 0x5555
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT: retq
@@ -246,18 +246,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT: shll $4, %eax
-; GFNIAVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX-NEXT: shrl $4, %edi
+; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333
-; GFNIAVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX-NEXT: shrl $2, %edi
+; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555
-; GFNIAVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX-NEXT: shrl %eax
+; GFNIAVX-NEXT: andl $21845, %eax # imm = 0x5555
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT: retq
@@ -269,18 +269,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX2-NEXT: shll $4, %eax
-; GFNIAVX2-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX2-NEXT: shrl $4, %edi
+; GFNIAVX2-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNIAVX2-NEXT: orl %eax, %edi
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $13107, %eax # imm = 0x3333
-; GFNIAVX2-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX2-NEXT: shrl $2, %edi
+; GFNIAVX2-NEXT: andl $13107, %edi # imm = 0x3333
; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT: movl %eax, %ecx
; GFNIAVX2-NEXT: andl $21845, %ecx # imm = 0x5555
-; GFNIAVX2-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX2-NEXT: shrl %eax
+; GFNIAVX2-NEXT: andl $21845, %eax # imm = 0x5555
; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX2-NEXT: retq
@@ -292,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX512F-NEXT: shll $4, %eax
-; GFNIAVX512F-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512F-NEXT: shrl $4, %edi
+; GFNIAVX512F-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNIAVX512F-NEXT: orl %eax, %edi
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $13107, %eax # imm = 0x3333
-; GFNIAVX512F-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512F-NEXT: shrl $2, %edi
+; GFNIAVX512F-NEXT: andl $13107, %edi # imm = 0x3333
; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT: movl %eax, %ecx
; GFNIAVX512F-NEXT: andl $21845, %ecx # imm = 0x5555
-; GFNIAVX512F-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512F-NEXT: shrl %eax
+; GFNIAVX512F-NEXT: andl $21845, %eax # imm = 0x5555
; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX512F-NEXT: retq
@@ -315,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX512BW-NEXT: shll $4, %eax
-; GFNIAVX512BW-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512BW-NEXT: shrl $4, %edi
+; GFNIAVX512BW-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNIAVX512BW-NEXT: orl %eax, %edi
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $13107, %eax # imm = 0x3333
-; GFNIAVX512BW-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512BW-NEXT: shrl $2, %edi
+; GFNIAVX512BW-NEXT: andl $13107, %edi # imm = 0x3333
; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT: movl %eax, %ecx
; GFNIAVX512BW-NEXT: andl $21845, %ecx # imm = 0x5555
-; GFNIAVX512BW-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512BW-NEXT: shrl %eax
+; GFNIAVX512BW-NEXT: andl $21845, %eax # imm = 0x5555
; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX512BW-NEXT: retq
@@ -342,18 +342,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
-; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT: shrl $4, %edi
+; SSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
-; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT: shrl $2, %edi
+; SSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT: shrl %eax
+; SSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
@@ -364,18 +364,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
-; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT: shrl $4, %edi
+; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
-; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT: shrl $2, %edi
+; AVX-NEXT: andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT: shrl %eax
+; AVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
@@ -393,18 +393,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT: shll $4, %eax
-; GFNISSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNISSE-NEXT: shrl $4, %edi
+; GFNISSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNISSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNISSE-NEXT: shrl $2, %edi
+; GFNISSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNISSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNISSE-NEXT: shrl %eax
+; GFNISSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: retq
;
@@ -415,18 +415,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT: shll $4, %eax
-; GFNIAVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX-NEXT: shrl $4, %edi
+; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX-NEXT: shrl $2, %edi
+; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX-NEXT: shrl %eax
+; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: retq
;
@@ -437,18 +437,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX2-NEXT: shll $4, %eax
-; GFNIAVX2-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX2-NEXT: shrl $4, %edi
+; GFNIAVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX2-NEXT: orl %eax, %edi
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX2-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX2-NEXT: shrl $2, %edi
+; GFNIAVX2-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT: movl %eax, %ecx
; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX2-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX2-NEXT: shrl %eax
+; GFNIAVX2-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT: retq
;
@@ -459,18 +459,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT: shll $4, %eax
-; GFNIAVX512F-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512F-NEXT: shrl $4, %edi
+; GFNIAVX512F-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT: orl %eax, %edi
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX512F-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512F-NEXT: shrl $2, %edi
+; GFNIAVX512F-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT: movl %eax, %ecx
; GFNIAVX512F-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX512F-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512F-NEXT: shrl %eax
+; GFNIAVX512F-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT: retq
;
@@ -481,18 +481,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT: shll $4, %eax
-; GFNIAVX512BW-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512BW-NEXT: shrl $4, %edi
+; GFNIAVX512BW-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT: orl %eax, %edi
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $858993459, %eax # imm = 0x33333333
-; GFNIAVX512BW-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512BW-NEXT: shrl $2, %edi
+; GFNIAVX512BW-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT: movl %eax, %ecx
; GFNIAVX512BW-NEXT: andl $1431655765, %ecx # imm = 0x55555555
-; GFNIAVX512BW-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512BW-NEXT: shrl %eax
+; GFNIAVX512BW-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 %a)
@@ -503,49 +503,49 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
-; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; SSE-NEXT: andq %rdi, %rax
-; SSE-NEXT: shlq $4, %rax
-; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; SSE-NEXT: andq %rdi, %rcx
-; SSE-NEXT: shrq $4, %rcx
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shrq $4, %rax
+; SSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rcx, %rax
-; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; SSE-NEXT: andq %rcx, %rdx
-; SSE-NEXT: shrq $2, %rdx
-; SSE-NEXT: leaq (%rdx,%rax,4), %rax
-; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; SSE-NEXT: andq %rcx, %rdi
+; SSE-NEXT: shlq $4, %rdi
+; SSE-NEXT: orq %rax, %rdi
+; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: andq %rax, %rcx
-; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; SSE-NEXT: andq %rax, %rdx
-; SSE-NEXT: shrq %rdx
-; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
+; SSE-NEXT: shrq $2, %rdi
+; SSE-NEXT: andq %rax, %rdi
+; SSE-NEXT: leaq (%rdi,%rcx,4), %rax
+; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: andq %rcx, %rdx
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: andq %rcx, %rax
+; SSE-NEXT: leaq (%rax,%rdx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
-; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; AVX-NEXT: andq %rdi, %rax
-; AVX-NEXT: shlq $4, %rax
-; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; AVX-NEXT: andq %rdi, %rcx
-; AVX-NEXT: shrq $4, %rcx
-; AVX-NEXT: orq %rax, %rcx
-; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: shrq $4, %rax
+; AVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rcx, %rax
-; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; AVX-NEXT: andq %rcx, %rdx
-; AVX-NEXT: shrq $2, %rdx
-; AVX-NEXT: leaq (%rdx,%rax,4), %rax
-; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX-NEXT: andq %rcx, %rdi
+; AVX-NEXT: shlq $4, %rdi
+; AVX-NEXT: orq %rax, %rdi
+; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: andq %rax, %rcx
-; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; AVX-NEXT: andq %rax, %rdx
-; AVX-NEXT: shrq %rdx
-; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
+; AVX-NEXT: shrq $2, %rdi
+; AVX-NEXT: andq %rax, %rdi
+; AVX-NEXT: leaq (%rdi,%rcx,4), %rax
+; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: andq %rcx, %rdx
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: andq %rcx, %rax
+; AVX-NEXT: leaq (%rax,%rdx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
@@ -558,121 +558,121 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: bswapq %rdi
-; GFNISSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNISSE-NEXT: andq %rdi, %rax
-; GFNISSE-NEXT: shlq $4, %rax
-; GFNISSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNISSE-NEXT: andq %rdi, %rcx
-; GFNISSE-NEXT: shrq $4, %rcx
-; GFNISSE-NEXT: orq %rax, %rcx
-; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNISSE-NEXT: movq %rdi, %rax
+; GFNISSE-NEXT: shrq $4, %rax
+; GFNISSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT: andq %rcx, %rax
-; GFNISSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNISSE-NEXT: andq %rcx, %rdx
-; GFNISSE-NEXT: shrq $2, %rdx
-; GFNISSE-NEXT: leaq (%rdx,%rax,4), %rax
-; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNISSE-NEXT: andq %rcx, %rdi
+; GFNISSE-NEXT: shlq $4, %rdi
+; GFNISSE-NEXT: orq %rax, %rdi
+; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNISSE-NEXT: movq %rdi, %rcx
; GFNISSE-NEXT: andq %rax, %rcx
-; GFNISSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNISSE-NEXT: andq %rax, %rdx
-; GFNISSE-NEXT: shrq %rdx
-; GFNISSE-NEXT: leaq (%rdx,%rcx,2), %rax
+; GFNISSE-NEXT: shrq $2, %rdi
+; GFNISSE-NEXT: andq %rax, %rdi
+; GFNISSE-NEXT: leaq (%rdi,%rcx,4), %rax
+; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNISSE-NEXT: movq %rax, %rdx
+; GFNISSE-NEXT: andq %rcx, %rdx
+; GFNISSE-NEXT: shrq %rax
+; GFNISSE-NEXT: andq %rcx, %rax
+; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: bswapq %rdi
-; GFNIAVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX-NEXT: andq %rdi, %rax
-; GFNIAVX-NEXT: shlq $4, %rax
-; GFNIAVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX-NEXT: andq %rdi, %rcx
-; GFNIAVX-NEXT: shrq $4, %rcx
-; GFNIAVX-NEXT: orq %rax, %rcx
-; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX-NEXT: movq %rdi, %rax
+; GFNIAVX-NEXT: shrq $4, %rax
+; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT: andq %rcx, %rax
-; GFNIAVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX-NEXT: andq %rcx, %rdx
-; GFNIAVX-NEXT: shrq $2, %rdx
-; GFNIAVX-NEXT: leaq (%rdx,%rax,4), %rax
-; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX-NEXT: andq %rcx, %rdi
+; GFNIAVX-NEXT: shlq $4, %rdi
+; GFNIAVX-NEXT: orq %rax, %rdi
+; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX-NEXT: movq %rdi, %rcx
; GFNIAVX-NEXT: andq %rax, %rcx
-; GFNIAVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX-NEXT: andq %rax, %rdx
-; GFNIAVX-NEXT: shrq %rdx
-; GFNIAVX-NEXT: leaq (%rdx,%rcx,2), %rax
+; GFNIAVX-NEXT: shrq $2, %rdi
+; GFNIAVX-NEXT: andq %rax, %rdi
+; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax
+; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX-NEXT: movq %rax, %rdx
+; GFNIAVX-NEXT: andq %rcx, %rdx
+; GFNIAVX-NEXT: shrq %rax
+; GFNIAVX-NEXT: andq %rcx, %rax
+; GFNIAVX-NEXT: leaq (%rax,%rdx,2), %rax
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i64:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: bswapq %rdi
-; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX2-NEXT: andq %rdi, %rax
-; GFNIAVX2-NEXT: shlq $4, %rax
-; GFNIAVX2-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX2-NEXT: andq %rdi, %rcx
-; GFNIAVX2-NEXT: shrq $4, %rcx
-; GFNIAVX2-NEXT: orq %rax, %rcx
-; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT: movq %rdi, %rax
+; GFNIAVX2-NEXT: shrq $4, %rax
+; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX2-NEXT: andq %rcx, %rax
-; GFNIAVX2-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX2-NEXT: andq %rcx, %rdx
-; GFNIAVX2-NEXT: shrq $2, %rdx
-; GFNIAVX2-NEXT: leaq (%rdx,%rax,4), %rax
-; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT: andq %rcx, %rdi
+; GFNIAVX2-NEXT: shlq $4, %rdi
+; GFNIAVX2-NEXT: orq %rax, %rdi
+; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX2-NEXT: movq %rdi, %rcx
; GFNIAVX2-NEXT: andq %rax, %rcx
-; GFNIAVX2-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX2-NEXT: andq %rax, %rdx
-; GFNIAVX2-NEXT: shrq %rdx
-; GFNIAVX2-NEXT: leaq (%rdx,%rcx,2), %rax
+; GFNIAVX2-NEXT: shrq $2, %rdi
+; GFNIAVX2-NEXT: andq %rax, %rdi
+; GFNIAVX2-NEXT: leaq (%rdi,%rcx,4), %rax
+; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX2-NEXT: movq %rax, %rdx
+; GFNIAVX2-NEXT: andq %rcx, %rdx
+; GFNIAVX2-NEXT: shrq %rax
+; GFNIAVX2-NEXT: andq %rcx, %rax
+; GFNIAVX2-NEXT: leaq (%rax,%rdx,2), %rax
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i64:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: bswapq %rdi
-; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX512F-NEXT: andq %rdi, %rax
-; GFNIAVX512F-NEXT: shlq $4, %rax
-; GFNIAVX512F-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX512F-NEXT: andq %rdi, %rcx
-; GFNIAVX512F-NEXT: shrq $4, %rcx
-; GFNIAVX512F-NEXT: orq %rax, %rcx
-; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512F-NEXT: movq %rdi, %rax
+; GFNIAVX512F-NEXT: shrq $4, %rax
+; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512F-NEXT: andq %rcx, %rax
-; GFNIAVX512F-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX512F-NEXT: andq %rcx, %rdx
-; GFNIAVX512F-NEXT: shrq $2, %rdx
-; GFNIAVX512F-NEXT: leaq (%rdx,%rax,4), %rax
-; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512F-NEXT: andq %rcx, %rdi
+; GFNIAVX512F-NEXT: shlq $4, %rdi
+; GFNIAVX512F-NEXT: orq %rax, %rdi
+; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512F-NEXT: movq %rdi, %rcx
; GFNIAVX512F-NEXT: andq %rax, %rcx
-; GFNIAVX512F-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX512F-NEXT: andq %rax, %rdx
-; GFNIAVX512F-NEXT: shrq %rdx
-; GFNIAVX512F-NEXT: leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512F-NEXT: shrq $2, %rdi
+; GFNIAVX512F-NEXT: andq %rax, %rdi
+; GFNIAVX512F-NEXT: leaq (%rdi,%rcx,4), %rax
+; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512F-NEXT: movq %rax, %rdx
+; GFNIAVX512F-NEXT: andq %rcx, %rdx
+; GFNIAVX512F-NEXT: shrq %rax
+; GFNIAVX512F-NEXT: andq %rcx, %rax
+; GFNIAVX512F-NEXT: leaq (%rax,%rdx,2), %rax
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i64:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: bswapq %rdi
-; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
-; GFNIAVX512BW-NEXT: andq %rdi, %rax
-; GFNIAVX512BW-NEXT: shlq $4, %rax
-; GFNIAVX512BW-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; GFNIAVX512BW-NEXT: andq %rdi, %rcx
-; GFNIAVX512BW-NEXT: shrq $4, %rcx
-; GFNIAVX512BW-NEXT: orq %rax, %rcx
-; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512BW-NEXT: movq %rdi, %rax
+; GFNIAVX512BW-NEXT: shrq $4, %rax
+; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512BW-NEXT: andq %rcx, %rax
-; GFNIAVX512BW-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
-; GFNIAVX512BW-NEXT: andq %rcx, %rdx
-; GFNIAVX512BW-NEXT: shrq $2, %rdx
-; GFNIAVX512BW-NEXT: leaq (%rdx,%rax,4), %rax
-; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: andq %rcx, %rdi
+; GFNIAVX512BW-NEXT: shlq $4, %rdi
+; GFNIAVX512BW-NEXT: orq %rax, %rdi
+; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; GFNIAVX512BW-NEXT: movq %rdi, %rcx
; GFNIAVX512BW-NEXT: andq %rax, %rcx
-; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
-; GFNIAVX512BW-NEXT: andq %rax, %rdx
-; GFNIAVX512BW-NEXT: shrq %rdx
-; GFNIAVX512BW-NEXT: leaq (%rdx,%rcx,2), %rax
+; GFNIAVX512BW-NEXT: shrq $2, %rdi
+; GFNIAVX512BW-NEXT: andq %rax, %rdi
+; GFNIAVX512BW-NEXT: leaq (%rdi,%rcx,4), %rax
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: movq %rax, %rdx
+; GFNIAVX512BW-NEXT: andq %rcx, %rdx
+; GFNIAVX512BW-NEXT: shrq %rax
+; GFNIAVX512BW-NEXT: andq %rcx, %rax
+; GFNIAVX512BW-NEXT: leaq (%rax,%rdx,2), %rax
; GFNIAVX512BW-NEXT: retq
%b = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %b
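
For readers tracing the scalar i64 checks above, here is a minimal C++ sketch of the sequence they encode: a byte swap followed by three masked-shift stages, where a single low-half mask is reused after the right shift and before the left shift. The helper name and the use of the GCC/Clang builtin __builtin_bswap64 are illustrative assumptions, not part of this patch.

#include <cstdint>

// Illustrative only: mirrors the masked-shift stages in the i64 test output above.
static inline uint64_t bitreverse64_sketch(uint64_t v) {
  v = __builtin_bswap64(v);                    // reverse byte order (bswapq)
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) |
      ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);      // swap nibbles within each byte
  v = ((v >> 2) & 0x3333333333333333ULL) |
      ((v & 0x3333333333333333ULL) << 2);      // swap 2-bit pairs
  v = ((v >> 1) & 0x5555555555555555ULL) |
      ((v & 0x5555555555555555ULL) << 1);      // swap adjacent bits
  return v;
}

The leaq instructions in the checks fold the final shift and OR of each stage into one address computation; the plain shift-and-OR form above computes the same value because the two halves have disjoint bits.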
@@ -687,16 +687,18 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
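
The SSE2 vector checks follow the same shape with pand/psrlw/psllw/por. As a hedged, intrinsic-based illustration (not taken from this patch; the helper name is an assumption), the 2-bit stage of the byte-wise sequence above could be written as:

#include <emmintrin.h>

// Illustrative only: one 0x33 mask register serves both halves of the stage,
// applied after psrlw and before psllw, matching the updated CHECK lines.
static inline __m128i swap_bit_pairs_sketch(__m128i v) {
  const __m128i m2 = _mm_set1_epi8(0x33);
  __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 2), m2); // (v >> 2) & 0x33 per byte
  __m128i lo = _mm_slli_epi16(_mm_and_si128(v, m2), 2); // (v & 0x33) << 2 per byte
  return _mm_or_si128(hi, lo);                          // recombine (por)
}

Masking with 0x33 after the 16-bit right shift also clears any bits that crossed a byte boundary, which is why byte-granular masks are safe with the word-sized psrlw/psllw shifts used in these tests.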
@@ -775,16 +777,18 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -875,16 +879,18 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -977,16 +983,18 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1071,38 +1079,38 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: psllw $4, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psllw $2, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
@@ -1248,42 +1256,42 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: psrlw $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: psrlw $8, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: psllw $4, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm7, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psllw $2, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
@@ -1434,63 +1442,63 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psllw $2, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
@@ -1641,67 +1649,67 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psllw $2, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
@@ -1851,7 +1859,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1860,76 +1868,76 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: psllw $2, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
+; SSE2-NEXT: psrlw $2, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm7, %xmm5
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm10
-; SSE2-NEXT: pand %xmm3, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm10, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v64i8:
@@ -2152,20 +2160,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: psllw $2, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
+; SSE2-NEXT: psrlw $2, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
@@ -2180,15 +2186,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -2203,15 +2209,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm5
@@ -2224,16 +2230,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm10
-; SSE2-NEXT: psllw $2, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm10, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
@@ -2478,118 +2486,118 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: psllw $2, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: psrlw $2, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm9, %xmm7
; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psllw $4, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i32:
@@ -2834,126 +2842,126 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: psllw $2, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: psrlw $2, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm9, %xmm7
; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psllw $4, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
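In the updated SSE2 sequences above, each swap stage applies the same splatted
constant to both the shifted copy and the unshifted value (the [15,15,...],
[51,51,...] and [85,85,...] vectors, i.e. 0x0F, 0x33 and 0x55), masking after
the right shift and before the left shift. As a rough standalone sketch of that
scalar pattern, not LLVM code, here is one way to reverse the bits of a single
byte in C++; the helper name reverse_byte is made up for this illustration:

#include <cstdint>
#include <cstdio>

static uint8_t reverse_byte(uint8_t V) {
  // Each stage masks after the right shift and before the left shift,
  // so a single constant per stage suffices.
  V = ((V >> 4) & 0x0F) | ((V & 0x0F) << 4); // swap nibbles
  V = ((V >> 2) & 0x33) | ((V & 0x33) << 2); // swap bit pairs
  V = ((V >> 1) & 0x55) | ((V & 0x55) << 1); // swap adjacent bits
  return V;
}

int main() {
  std::printf("%02x\n", reverse_byte(0x01)); // prints 80
  return 0;
}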