[llvm] 529bd4f - [DAG] SimplifyDemandedBits - don't early-out for multiple use values
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 27 02:54:24 PDT 2022
Author: Simon Pilgrim
Date: 2022-07-27T10:54:06+01:00
New Revision: 529bd4f3525d7ff564012597df204d8a8a4fb9ac
URL: https://github.com/llvm/llvm-project/commit/529bd4f3525d7ff564012597df204d8a8a4fb9ac
DIFF: https://github.com/llvm/llvm-project/commit/529bd4f3525d7ff564012597df204d8a8a4fb9ac.diff
LOG: [DAG] SimplifyDemandedBits - don't early-out for multiple use values
SimplifyDemandedBits currently early-outs for multi-use values beyond the root node (just returning the known bits), which misses a number of optimizations, as there are plenty of cases where we can still simplify when initially demanding all elements/bits.
@lenary has confirmed that the test cases in aes-erratum-fix.ll need refactoring and that the current codegen increase is not a major concern.
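As a rough illustration (a hypothetical sketch, not one of the tests touched by this commit): a multi-use value reached at Depth != 0 previously blocked simplification, even when demanding all of its bits would still let us fold it for every user at once.

; %masked has two uses, so SimplifyDemandedBits used to stop at it when
; recursing from either user. With this change it retries with all
; bits/elts demanded, so the redundant 'and' (the zext already guarantees
; bits 8-31 are zero) can still be removed for both users.
define i32 @multi_use(i8 %v) {
  %a = zext i8 %v to i32
  %masked = and i32 %a, 255
  %lo = shl i32 %masked, 8
  %sum = add i32 %masked, %lo
  ret i32 %sum
}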
Differential Revision: https://reviews.llvm.org/D129765
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
llvm/test/CodeGen/ARM/aes-erratum-fix.ll
llvm/test/CodeGen/RISCV/rv32zbp.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
llvm/test/CodeGen/Thumb2/mve-vabdus.ll
llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
llvm/test/CodeGen/Thumb2/mve-vst3.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/dagcombine-cse.ll
llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
llvm/test/CodeGen/X86/smul-with-overflow.ll
llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6205e74837c04..102c412fe72fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1089,6 +1089,10 @@ bool TargetLowering::SimplifyDemandedBits(
if (Op.isUndef())
return false;
+ // We can't simplify target constants.
+ if (Op.getOpcode() == ISD::TargetConstant)
+ return false;
+
if (Op.getOpcode() == ISD::Constant) {
// We know all of the bits for a constant!
Known = KnownBits::makeConstant(cast<ConstantSDNode>(Op)->getAPIntValue());
@@ -1103,17 +1107,16 @@ bool TargetLowering::SimplifyDemandedBits(
}
// Other users may use these bits.
+ bool HasMultiUse = false;
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
- if (Depth != 0) {
- // If not at the root, Just compute the Known bits to
- // simplify things downstream.
- Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ if (Depth >= SelectionDAG::MaxRecursionDepth) {
+ // Limit search depth.
return false;
}
- // If this is the root being simplified, allow it to have multiple uses,
- // just set the DemandedBits/Elts to all bits.
+ // Allow multiple uses, just set the DemandedBits/Elts to all bits.
DemandedBits = APInt::getAllOnes(BitWidth);
DemandedElts = APInt::getAllOnes(NumElts);
+ HasMultiUse = true;
} else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
// Not demanding any bits/elts from Op.
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -1124,8 +1127,6 @@ bool TargetLowering::SimplifyDemandedBits(
KnownBits Known2;
switch (Op.getOpcode()) {
- case ISD::TargetConstant:
- llvm_unreachable("Can't simplify this node");
case ISD::SCALAR_TO_VECTOR: {
if (!DemandedElts[0])
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -2715,6 +2716,12 @@ bool TargetLowering::SimplifyDemandedBits(
APFloat(TLO.DAG.EVTToAPFloatSemantics(VT), Known.One), dl, VT));
}
+ // A multi use 'all demanded elts' simplify failed to find any knownbits.
+ // Try again just for the original demanded elts.
+ // Ensure we do this AFTER constant folding above.
+ if (HasMultiUse && Known.isUnknown() && !OriginalDemandedElts.isAllOnes())
+ Known = TLO.DAG.computeKnownBits(Op, OriginalDemandedElts, Depth);
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 6e30267162b96..427b1fed1f307 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -2616,36 +2616,36 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: mov w8, #1895825407
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: mov x25, #-34359738368
-; CHECK-NEXT: mov x23, #34359738367
+; CHECK-NEXT: mov x22, #34359738367
; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, x25, x1, lt
+; CHECK-NEXT: csel x8, x25, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x9, x23, x9, gt
-; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: csinv x9, x9, xzr, le
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csel x8, xzr, x8, vs
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT: csel x8, xzr, x9, vs
; CHECK-NEXT: str x8, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: csel x8, xzr, x9, vs
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, x25, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x9, x23, x9, gt
+; CHECK-NEXT: csel x9, x22, x9, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x8, xzr, x8, vs
-; CHECK-NEXT: csel x22, xzr, x9, vs
+; CHECK-NEXT: csel x10, xzr, x8, vs
+; CHECK-NEXT: csel x8, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x8, x10, [sp, #8] // 16-byte Folded Spill
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, s10
@@ -2654,10 +2654,10 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x23, x8, gt
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x24, xzr, x8, vs
+; CHECK-NEXT: csel x26, xzr, x8, vs
; CHECK-NEXT: csel x8, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
@@ -2669,40 +2669,39 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x23, x8, gt
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x27, xzr, x8, vs
+; CHECK-NEXT: csel x28, xzr, x8, vs
; CHECK-NEXT: csel x8, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, x25, x1, lt
+; CHECK-NEXT: csel x8, x25, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x9, x23, x9, gt
-; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: csinv x9, x9, xzr, le
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x8, xzr, x8, vs
-; CHECK-NEXT: csel x29, xzr, x9, vs
+; CHECK-NEXT: csel x27, xzr, x8, vs
+; CHECK-NEXT: csel x20, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, x25, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x9, x23, x9, gt
+; CHECK-NEXT: csel x9, x22, x9, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x20, xzr, x8, vs
-; CHECK-NEXT: csel x28, xzr, x9, vs
+; CHECK-NEXT: csel x29, xzr, x8, vs
+; CHECK-NEXT: csel x21, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -2712,65 +2711,54 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x23, x8, gt
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csel x21, xzr, x8, vs
-; CHECK-NEXT: csel x26, xzr, x9, vs
+; CHECK-NEXT: csel x23, xzr, x8, vs
+; CHECK-NEXT: csel x24, xzr, x9, vs
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: fmov d0, x20
; CHECK-NEXT: fcmp s8, s10
-; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT: lsr x10, x28, #28
-; CHECK-NEXT: ldr d1, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: lsr x12, x29, #28
-; CHECK-NEXT: mov v0.d[1], x28
+; CHECK-NEXT: extr x9, x21, x29, #28
+; CHECK-NEXT: bfi x23, x20, #36, #28
+; CHECK-NEXT: extr x11, x27, x20, #28
+; CHECK-NEXT: str x24, [x19]
; CHECK-NEXT: csel x8, x25, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: stur x11, [x19, #75]
-; CHECK-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x23, x8, gt
+; CHECK-NEXT: stur x9, [x19, #41]
+; CHECK-NEXT: stp x23, x11, [x19, #8]
+; CHECK-NEXT: lsr x11, x27, #28
+; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: lsr x10, x21, #28
+; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: stur x13, [x19, #50]
-; CHECK-NEXT: mov v1.d[1], x29
-; CHECK-NEXT: ldr d0, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x9, xzr, x9, vs
; CHECK-NEXT: strb w10, [x19, #49]
-; CHECK-NEXT: extr x10, x28, x11, #28
; CHECK-NEXT: csel x8, xzr, x8, vs
-; CHECK-NEXT: bfi x8, x11, #36, #28
-; CHECK-NEXT: strb w12, [x19, #24]
+; CHECK-NEXT: ldr x10, [sp] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x9, vs
+; CHECK-NEXT: bfi x8, x29, #36, #28
+; CHECK-NEXT: strb w11, [x19, #24]
+; CHECK-NEXT: stur x10, [x19, #75]
+; CHECK-NEXT: ldp x12, x11, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: stur x9, [x19, #25]
-; CHECK-NEXT: fmov x12, d1
-; CHECK-NEXT: stur x10, [x19, #41]
-; CHECK-NEXT: lsr x9, x22, #28
-; CHECK-NEXT: ldr d1, [sp, #24] // 8-byte Folded Reload
; CHECK-NEXT: stur x8, [x19, #33]
+; CHECK-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: extr x10, x12, x11, #28
+; CHECK-NEXT: bfi x28, x11, #36, #28
+; CHECK-NEXT: stur x8, [x19, #50]
+; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; CHECK-NEXT: ldr x11, [sp, #72] // 8-byte Folded Reload
-; CHECK-NEXT: extr x18, x29, x12, #28
-; CHECK-NEXT: mov v0.d[1], x22
-; CHECK-NEXT: bfi x21, x12, #36, #28
-; CHECK-NEXT: str x26, [x19]
-; CHECK-NEXT: mov v1.d[1], x11
-; CHECK-NEXT: lsr x10, x11, #28
-; CHECK-NEXT: mov x13, x11
-; CHECK-NEXT: stp x21, x18, [x19, #8]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: strb w9, [x19, #99]
-; CHECK-NEXT: strb w10, [x19, #74]
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: extr x12, x22, x8, #28
-; CHECK-NEXT: bfi x27, x8, #36, #28
-; CHECK-NEXT: extr x8, x13, x11, #28
-; CHECK-NEXT: bfi x24, x11, #36, #28
-; CHECK-NEXT: stur x12, [x19, #91]
-; CHECK-NEXT: stur x27, [x19, #83]
+; CHECK-NEXT: stur x10, [x19, #91]
+; CHECK-NEXT: stur x28, [x19, #83]
+; CHECK-NEXT: extr x8, x11, x9, #28
+; CHECK-NEXT: bfi x26, x9, #36, #28
+; CHECK-NEXT: lsr x9, x12, #28
; CHECK-NEXT: stur x8, [x19, #66]
-; CHECK-NEXT: stur x24, [x19, #58]
+; CHECK-NEXT: lsr x8, x11, #28
+; CHECK-NEXT: stur x26, [x19, #58]
+; CHECK-NEXT: strb w9, [x19, #99]
+; CHECK-NEXT: strb w8, [x19, #74]
; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: ldp x24, x23, [sp, #144] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 35b78615aa7f0..e669ea5a26522 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2195,28 +2195,28 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: mov w8, #1904214015
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: mov x21, #68719476735
+; CHECK-NEXT: mov x23, #68719476735
; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: fmov s9, w8
-; CHECK-NEXT: csel x8, xzr, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x20, x21, x8, gt
+; CHECK-NEXT: csel x9, x23, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp x8, x9, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: csel x8, xzr, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x23, x21, x8, gt
+; CHECK-NEXT: csel x9, x23, x9, gt
+; CHECK-NEXT: csinv x24, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
@@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x24, x21, x9, gt
+; CHECK-NEXT: csel x25, x23, x9, gt
; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
@@ -2238,29 +2238,29 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x26, x21, x9, gt
-; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: csel x27, x23, x9, gt
+; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: csel x8, xzr, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csinv x29, x9, xzr, le
-; CHECK-NEXT: csel x28, x21, x8, gt
+; CHECK-NEXT: csel x29, x23, x9, gt
+; CHECK-NEXT: csinv x26, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: csel x8, xzr, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csinv x27, x9, xzr, le
-; CHECK-NEXT: csel x22, x21, x8, gt
+; CHECK-NEXT: csel x28, x23, x9, gt
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -2270,58 +2270,46 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x25, x21, x9, gt
-; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
+; CHECK-NEXT: csel x21, x23, x9, gt
+; CHECK-NEXT: csinv x22, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT: fmov d0, x27
-; CHECK-NEXT: fmov d1, x29
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: lsr x10, x22, #28
-; CHECK-NEXT: stur x11, [x19, #75]
-; CHECK-NEXT: lsr x11, x28, #28
-; CHECK-NEXT: mov v0.d[1], x22
-; CHECK-NEXT: ldr x12, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT: mov v1.d[1], x28
+; CHECK-NEXT: extr x8, x28, x20, #28
+; CHECK-NEXT: bfi x21, x26, #36, #28
+; CHECK-NEXT: extr x9, x29, x26, #28
+; CHECK-NEXT: lsr x11, x29, #28
+; CHECK-NEXT: str x22, [x19]
+; CHECK-NEXT: stur x8, [x19, #41]
; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: csel x10, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: stur x12, [x19, #50]
-; CHECK-NEXT: fmov x12, d0
-; CHECK-NEXT: fmov x13, d1
+; CHECK-NEXT: stp x21, x9, [x19, #8]
+; CHECK-NEXT: lsr x9, x28, #28
+; CHECK-NEXT: strb w11, [x19, #24]
+; CHECK-NEXT: bfi x27, x24, #36, #28
+; CHECK-NEXT: csel x10, x23, x10, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: ldp d0, d1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: csel x9, x21, x9, gt
-; CHECK-NEXT: strb w10, [x19, #49]
-; CHECK-NEXT: extr x10, x22, x12, #28
-; CHECK-NEXT: bfi x9, x12, #36, #28
+; CHECK-NEXT: bfi x10, x20, #36, #28
+; CHECK-NEXT: strb w9, [x19, #49]
; CHECK-NEXT: stur x8, [x19, #25]
-; CHECK-NEXT: extr x8, x28, x13, #28
-; CHECK-NEXT: mov v0.d[1], x23
-; CHECK-NEXT: strb w11, [x19, #24]
-; CHECK-NEXT: mov v1.d[1], x20
-; CHECK-NEXT: stur x10, [x19, #41]
-; CHECK-NEXT: stur x9, [x19, #33]
-; CHECK-NEXT: bfi x25, x13, #36, #28
-; CHECK-NEXT: str x8, [x19, #16]
-; CHECK-NEXT: lsr x9, x23, #28
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: ldr x12, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: lsr x10, x20, #28
-; CHECK-NEXT: strb w9, [x19, #99]
-; CHECK-NEXT: stp x12, x25, [x19]
-; CHECK-NEXT: extr x12, x23, x8, #28
-; CHECK-NEXT: bfi x26, x8, #36, #28
-; CHECK-NEXT: extr x8, x20, x11, #28
-; CHECK-NEXT: bfi x24, x11, #36, #28
-; CHECK-NEXT: strb w10, [x19, #74]
-; CHECK-NEXT: stur x12, [x19, #91]
-; CHECK-NEXT: stur x26, [x19, #83]
-; CHECK-NEXT: stur x8, [x19, #66]
-; CHECK-NEXT: stur x24, [x19, #58]
+; CHECK-NEXT: stur x10, [x19, #33]
+; CHECK-NEXT: ldp x9, x12, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: stur x9, [x19, #75]
+; CHECK-NEXT: extr x8, x12, x24, #28
+; CHECK-NEXT: ldr x9, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: stur x9, [x19, #50]
+; CHECK-NEXT: ldp x11, x10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: stur x8, [x19, #91]
+; CHECK-NEXT: lsr x8, x12, #28
+; CHECK-NEXT: stur x27, [x19, #83]
+; CHECK-NEXT: extr x9, x10, x11, #28
+; CHECK-NEXT: bfi x25, x11, #36, #28
+; CHECK-NEXT: strb w8, [x19, #99]
+; CHECK-NEXT: stur x9, [x19, #66]
+; CHECK-NEXT: lsr x9, x10, #28
+; CHECK-NEXT: stur x25, [x19, #58]
+; CHECK-NEXT: strb w9, [x19, #74]
; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index de75f84110a52..1883db6c3dde6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2753,67 +2753,63 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s8, s6, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16
+; GFX6-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX6-NEXT: s_and_b32 s8, s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT: s_lshr_b32 s9, s6, 16
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
-; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
; GFX6-NEXT: v_trunc_f32_e32 v3, v3
-; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2
-; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7
-; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
-; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
-; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
+; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4
; GFX6-NEXT: s_and_b32 s6, s7, 0xffff
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: s_and_b32 s6, s5, 0xffff
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_lshr_b32 s4, s7, 16
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
-; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
; GFX6-NEXT: s_lshr_b32 s6, s5, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7
-; GFX6-NEXT: v_trunc_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7
+; GFX6-NEXT: v_trunc_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -3029,7 +3025,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GFX6-NEXT: s_sext_i32_i16 s6, s5
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v1
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_xor_b32 s4, s6, s4
@@ -3045,7 +3041,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GFX6-NEXT: s_ashr_i32 s5, s5, 16
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
; GFX6-NEXT: s_xor_b32 s4, s5, s4
@@ -3280,74 +3276,73 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3
-; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT: s_ashr_i32 s9, s6, 16
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3
-; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7
-; GFX6-NEXT: v_trunc_f32_e32 v5, v5
-; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6
-; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT: v_or_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4|
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT: s_ashr_i32 s4, s4, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT: s_xor_b32 s4, s4, s9
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s4, s4, 1
+; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
+; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1|
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: s_sext_i32_i16 s4, s7
-; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX6-NEXT: s_sext_i32_i16 s6, s5
; GFX6-NEXT: s_xor_b32 s4, s6, s4
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
; GFX6-NEXT: s_or_b32 s4, s4, 1
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_mul_f32_e32 v4, v2, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4
; GFX6-NEXT: v_trunc_f32_e32 v4, v4
-; GFX6-NEXT: v_mad_f32 v2, -v4, v3, v2
+; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1
; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX6-NEXT: s_ashr_i32 s4, s7, 16
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3|
-; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2|
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
; GFX6-NEXT: s_lshr_b32 s6, s7, 16
; GFX6-NEXT: s_ashr_i32 s7, s5, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
; GFX6-NEXT: s_xor_b32 s4, s7, s4
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
; GFX6-NEXT: s_or_b32 s4, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v5, v5
-; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4
+; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4
; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
; GFX6-NEXT: v_mov_b32_e32 v6, s4
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3|
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT: v_mul_lo_u32 v3, v3, s6
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s6
; GFX6-NEXT: s_lshr_b32 s4, s5, 16
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -3635,7 +3630,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -3719,7 +3714,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT: s_lshr_b32 s3, s4, 8
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -3999,54 +3994,50 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s8, s6, 0xffff
+; GFX6-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT: s_and_b32 s8, s4, 0xffff
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8
-; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5
-; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT: v_trunc_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
-; GFX6-NEXT: v_alignbit_b32 v0, s5, v0, 16
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT: s_lshr_b32 s9, s6, 16
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
+; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT: v_trunc_f32_e32 v1, v1
+; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4
-; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT: v_trunc_f32_e32 v4, v4
-; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3
-; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
; GFX6-NEXT: s_and_b32 s4, s5, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8
-; GFX6-NEXT: v_trunc_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3
-; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6
-; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6
+; GFX6-NEXT: v_trunc_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -4225,7 +4216,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GFX6-NEXT: s_sext_i32_i16 s5, s5
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_xor_b32 s4, s5, s4
@@ -4415,49 +4406,48 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_ashr_i32 s9, s6, 16
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3
-; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3
-; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7
-; GFX6-NEXT: v_trunc_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6
-; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
+; GFX6-NEXT: s_ashr_i32 s4, s4, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT: s_xor_b32 s4, s4, s9
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s4, s4, 1
+; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
+; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: s_sext_i32_i16 s4, s7
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4|
-; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4
-; GFX6-NEXT: v_or_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1|
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6
; GFX6-NEXT: s_sext_i32_i16 s6, s5
-; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2
; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_xor_b32 s4, s6, s4
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
; GFX6-NEXT: s_or_b32 s4, s4, 1
-; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX6-NEXT: v_trunc_f32_e32 v5, v5
-; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3
-; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, s4
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4|
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT: v_trunc_f32_e32 v4, v4
+; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3
+; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, s4
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -5026,7 +5016,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_mov_b32_e32 v5, s1
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
; GFX6-NEXT: s_xor_b32 s0, s1, s0
@@ -5251,7 +5241,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4|
; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc
; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f
; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0
@@ -5274,7 +5264,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
@@ -5287,11 +5277,11 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX6-NEXT: s_lshr_b32 s3, s2, 15
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v5
-; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4
@@ -5404,7 +5394,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -5683,9 +5673,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1
; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0
@@ -6490,7 +6480,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT: s_xor_b32 s11, s0, s1
; GFX6-NEXT: s_sub_i32 s0, 0, s10
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2
@@ -6504,7 +6494,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
; GFX6-NEXT: s_add_i32 s1, s9, s0
; GFX6-NEXT: s_xor_b32 s1, s1, s0
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
@@ -6954,7 +6944,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT: s_xor_b32 s4, s5, s9
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
@@ -7134,9 +7124,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9
; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9
; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -8217,7 +8207,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -8548,9 +8538,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -9297,9 +9287,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4
; GFX6-NEXT: v_mov_b32_e32 v7, s3
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5
@@ -10528,8 +10518,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3
; GFX6-NEXT: v_mov_b32_e32 v5, s5
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fdf6342c4b400..bbc023f92d099 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1735,94 +1735,94 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
-; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1
-; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:4
-; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:6
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:6
+; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] offset:3
-; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
-; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] offset:1
-; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1]
+; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] offset:2
+; GFX9-NEXT: global_load_ubyte v8, v0, s[0:1] offset:1
+; GFX9-NEXT: global_load_ubyte v9, v0, s[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v9
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX9-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: load_v7i8_to_v7f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
-; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6
+; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1
-; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4
+; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
+; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b96 v7, v[4:6], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f4c2b2f060fd9..267cea2181c23 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -51,11 +51,10 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1
-; SI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_barrier
-; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v2
+; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: ds_read_b32 v3, v2
; SI-NEXT: s_mov_b32 s3, 0xf000
@@ -77,16 +76,13 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
-; CI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1
-; CI-NEXT: ds_read_b32 v0, v0 offset:12
-; CI-NEXT: ds_read_b32 v3, v2 offset:12
+; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16
+; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v4, v[1:2], s[0:3], 0 addr64 offset:16
; CI-NEXT: s_endpgm
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 21c8260cbfefd..14b89416a721a 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -38,8 +38,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -85,8 +85,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 20ece3d1c1a56..418dfbcb5cd2e 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -56,19 +56,22 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x3
+; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x0
+; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT: v_mov_b32_e32 v1, s0
-; HAWAII-NEXT: v_mov_b32_e32 v2, s1
+; HAWAII-NEXT: s_and_b32 s3, s0, 0xffff
+; HAWAII-NEXT: v_mov_b32_e32 v1, s1
+; HAWAII-NEXT: v_mov_b32_e32 v2, s0
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
-; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
+; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
-; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; HAWAII-NEXT: v_or_b32_e32 v0, s3, v0
+; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT: ds_write_b32 v1, v2
+; HAWAII-NEXT: ds_write_b32 v1, v3
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i55:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e1dc9904b8123..f55f000e7d074 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -675,7 +675,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: s_load_dword s2, s[0:1], 0xe
; GCN-NEXT: s_load_dword s4, s[0:1], 0xd
; GCN-NEXT: s_load_dword s6, s[0:1], 0xc
-; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff
+; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xffff
@@ -687,7 +687,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_and_b32 s8, s6, 0xffff
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2
+; GCN-NEXT: v_mac_f32_e32 v1, 0, v2
; GCN-NEXT: v_rcp_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s9, s0, 0xff000000
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7e295cb0fb413..14d3503cce6e5 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -234,9 +234,8 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s0, 0xffff
+; VI-NEXT: s_add_i32 s1, s0, 12
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_i32 s1, s1, 12
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT: s_or_b32 s0, s1, 4
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
index 32bc6bc67d955..afd9a929f75ad 100644
--- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
+++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
@@ -1356,54 +1356,77 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr}
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_3
; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
-; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16]
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17
-; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d18[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d18[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr r5, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4
-; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12
-; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16
; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12
-; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_3
-; CHECK-FIX-NOSCHED-NEXT: b .LBB36_4
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_4
; CHECK-FIX-NOSCHED-NEXT: .LBB36_2:
-; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10]
-; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6]
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: b .LBB36_5
+; CHECK-FIX-NOSCHED-NEXT: .LBB36_3:
+; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6]
; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2]
; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14]
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5
; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12]
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
-; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8]
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8]
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6
-; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4]
+; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4]
; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4
; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
@@ -1411,44 +1434,46 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_4
-; CHECK-FIX-NOSCHED-NEXT: .LBB36_3:
-; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2
; CHECK-FIX-NOSCHED-NEXT: .LBB36_4:
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
-; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q0, q0
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d0[1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d1
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r1
+; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r0
; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1
-; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: .LBB36_5:
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7
; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
-; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16
; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11
; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
@@ -1494,39 +1519,63 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX: @ %bb.0:
; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-CORTEX-FIX-NEXT: .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT: vpush {d8}
+; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9}
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2
+; CHECK-CORTEX-FIX-NEXT: beq .LBB36_3
; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1]
; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17
+; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16]
; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
; CHECK-CORTEX-FIX-NEXT: vmov s4, r6
; CHECK-CORTEX-FIX-NEXT: vmov s6, r5
-; CHECK-CORTEX-FIX-NEXT: vmov s14, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s7, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7
-; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16
; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16
; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s5, r7
; CHECK-CORTEX-FIX-NEXT: vmov s9, r8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6
+; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d18[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
+; CHECK-CORTEX-FIX-NEXT: lsr r12, lr, #16
; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: bne .LBB36_3
-; CHECK-CORTEX-FIX-NEXT: b .LBB36_4
+; CHECK-CORTEX-FIX-NEXT: bne .LBB36_4
; CHECK-CORTEX-FIX-NEXT: .LBB36_2:
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
+; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r1
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: b .LBB36_5
+; CHECK-CORTEX-FIX-NEXT: .LBB36_3:
; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2]
; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2]
; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4]
@@ -1535,84 +1584,86 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10]
; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12]
; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14]
+; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r8
; CHECK-CORTEX-FIX-NEXT: vmov s4, r6
; CHECK-CORTEX-FIX-NEXT: vmov s6, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r8
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s9, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s11, r12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB36_4
-; CHECK-CORTEX-FIX-NEXT: .LBB36_3:
-; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2
; CHECK-CORTEX-FIX-NEXT: .LBB36_4:
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1]
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
-; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
-; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16]
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s1, r1
-; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s7
+; CHECK-CORTEX-FIX-NEXT: lsr r1, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r1
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0
-; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s5
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s18
+; CHECK-CORTEX-FIX-NEXT: .LBB36_5:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
; CHECK-CORTEX-FIX-NEXT: vmov r1, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s12
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5
@@ -1620,7 +1671,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s1
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s3
; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -1629,7 +1680,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc}
br i1 %0, label %5, label %12
@@ -1680,56 +1731,78 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_3
; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1]
; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17
-; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2
-; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12
-; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[0]
+; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_3
-; CHECK-FIX-NOSCHED-NEXT: b .LBB37_4
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_4
; CHECK-FIX-NOSCHED-NEXT: .LBB37_2:
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: b .LBB37_5
+; CHECK-FIX-NOSCHED-NEXT: .LBB37_3:
; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10]
-; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6]
-; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2]
; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14]
; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3
; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12]
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4
-; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8]
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr
-; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4]
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8]
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5
+; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4]
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1
; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
@@ -1737,47 +1810,48 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_4
-; CHECK-FIX-NOSCHED-NEXT: .LBB37_3:
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2
; CHECK-FIX-NOSCHED-NEXT: .LBB37_4:
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d3
+; CHECK-FIX-NOSCHED-NEXT: vmov r7, s9
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1]
+; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r7
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d2[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r3
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r0
; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
-; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: .LBB37_5:
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
-; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11
; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
@@ -1822,129 +1896,153 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX: @ %bb.0:
; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-CORTEX-FIX-NEXT: .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT: vpush {d8}
+; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9}
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2
+; CHECK-CORTEX-FIX-NEXT: beq .LBB37_3
; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1]
; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17
+; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16
; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
; CHECK-CORTEX-FIX-NEXT: vmov s2, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s14, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s3, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11
; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16
-; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r2
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: bne .LBB37_3
-; CHECK-CORTEX-FIX-NEXT: b .LBB37_4
+; CHECK-CORTEX-FIX-NEXT: bne .LBB37_4
; CHECK-CORTEX-FIX-NEXT: .LBB37_2:
-; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1]
-; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2]
-; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4]
-; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6]
-; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8]
-; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10]
-; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12]
-; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14]
-; CHECK-CORTEX-FIX-NEXT: vmov s0, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s2, r2
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
+; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: b .LBB37_5
+; CHECK-CORTEX-FIX-NEXT: .LBB37_3:
+; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1]
+; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2]
+; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4]
+; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6]
+; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8]
+; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10]
+; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12]
+; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14]
; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s2, r2
; CHECK-CORTEX-FIX-NEXT: vmov s8, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r5
-; CHECK-CORTEX-FIX-NEXT: vmov s11, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s13, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB37_4
-; CHECK-CORTEX-FIX-NEXT: .LBB37_3:
+; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2
+; CHECK-CORTEX-FIX-NEXT: .LBB37_4:
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[1]
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0
-; CHECK-CORTEX-FIX-NEXT: .LBB37_4:
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
-; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
-; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r2
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r2
-; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r3
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4
-; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s1
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d2[0]
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s18
+; CHECK-CORTEX-FIX-NEXT: .LBB37_5:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s12
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s6
@@ -1953,7 +2051,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s5
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s7
; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -1962,7 +2060,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc}
br i1 %0, label %5, label %11
@@ -3726,54 +3824,77 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr}
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_3
; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
-; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16]
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17
-; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d18[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d18[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr r5, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4
-; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12
-; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16
; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12
-; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_3
-; CHECK-FIX-NOSCHED-NEXT: b .LBB82_4
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_4
; CHECK-FIX-NOSCHED-NEXT: .LBB82_2:
-; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10]
-; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6]
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: b .LBB82_5
+; CHECK-FIX-NOSCHED-NEXT: .LBB82_3:
+; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6]
; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2]
; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14]
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5
; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12]
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
-; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8]
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8]
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6
-; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4]
+; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4]
; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4
; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
@@ -3781,44 +3902,46 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_4
-; CHECK-FIX-NOSCHED-NEXT: .LBB82_3:
-; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2
; CHECK-FIX-NOSCHED-NEXT: .LBB82_4:
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
-; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q0, q0
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d0[1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d1
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r1
+; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r0
; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1
-; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: .LBB82_5:
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7
; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
-; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16
; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11
; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
@@ -3864,39 +3987,63 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX: @ %bb.0:
; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-CORTEX-FIX-NEXT: .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT: vpush {d8}
+; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9}
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2
+; CHECK-CORTEX-FIX-NEXT: beq .LBB82_3
; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1]
; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17
+; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16]
; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
; CHECK-CORTEX-FIX-NEXT: vmov s4, r6
; CHECK-CORTEX-FIX-NEXT: vmov s6, r5
-; CHECK-CORTEX-FIX-NEXT: vmov s14, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s7, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7
-; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16
; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16
; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s5, r7
; CHECK-CORTEX-FIX-NEXT: vmov s9, r8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6
+; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d18[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
+; CHECK-CORTEX-FIX-NEXT: lsr r12, lr, #16
; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: bne .LBB82_3
-; CHECK-CORTEX-FIX-NEXT: b .LBB82_4
+; CHECK-CORTEX-FIX-NEXT: bne .LBB82_4
; CHECK-CORTEX-FIX-NEXT: .LBB82_2:
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
+; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r1
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: b .LBB82_5
+; CHECK-CORTEX-FIX-NEXT: .LBB82_3:
; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2]
; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2]
; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4]
@@ -3905,84 +4052,86 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10]
; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12]
; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14]
+; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r8
; CHECK-CORTEX-FIX-NEXT: vmov s4, r6
; CHECK-CORTEX-FIX-NEXT: vmov s6, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r8
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s9, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s11, r12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s11
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB82_4
-; CHECK-CORTEX-FIX-NEXT: .LBB82_3:
-; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2
; CHECK-CORTEX-FIX-NEXT: .LBB82_4:
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1]
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
-; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
-; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16]
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s1, r1
-; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s7
+; CHECK-CORTEX-FIX-NEXT: lsr r1, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r1
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0
-; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s5
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s18
+; CHECK-CORTEX-FIX-NEXT: .LBB82_5:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
; CHECK-CORTEX-FIX-NEXT: vmov r1, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s12
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5
@@ -3990,7 +4139,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s1
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s3
; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -3999,7 +4148,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc}
br i1 %0, label %5, label %12
@@ -4050,56 +4199,78 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_3
; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1]
; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17
-; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr
+; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2
-; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12
-; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[0]
+; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s10, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_3
-; CHECK-FIX-NOSCHED-NEXT: b .LBB83_4
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_4
; CHECK-FIX-NOSCHED-NEXT: .LBB83_2:
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: b .LBB83_5
+; CHECK-FIX-NOSCHED-NEXT: .LBB83_3:
; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10]
-; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6]
-; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2]
; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14]
; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3
; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12]
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4
-; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8]
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr
-; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4]
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8]
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5
+; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4]
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1
; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
@@ -4107,47 +4278,48 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_4
-; CHECK-FIX-NOSCHED-NEXT: .LBB83_3:
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2
; CHECK-FIX-NOSCHED-NEXT: .LBB83_4:
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d3
+; CHECK-FIX-NOSCHED-NEXT: vmov r7, s9
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1]
+; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r7
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d2[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r3
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r0
; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
-; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s7, r0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT: lsr r0, r7, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: .LBB83_5:
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
-; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11
; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
@@ -4192,42 +4364,65 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX: @ %bb.0:
; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-CORTEX-FIX-NEXT: .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT: vpush {d8}
+; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9}
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2
+; CHECK-CORTEX-FIX-NEXT: beq .LBB83_3
; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1]
; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17
+; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16
; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
; CHECK-CORTEX-FIX-NEXT: vmov s2, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s14, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s3, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11
; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16
-; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r2
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: bne .LBB83_3
-; CHECK-CORTEX-FIX-NEXT: b .LBB83_4
+; CHECK-CORTEX-FIX-NEXT: bne .LBB83_4
; CHECK-CORTEX-FIX-NEXT: .LBB83_2:
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
+; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: b .LBB83_5
+; CHECK-CORTEX-FIX-NEXT: .LBB83_3:
; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1]
; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2]
; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4]
@@ -4236,85 +4431,86 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10]
; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12]
; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14]
+; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r7
; CHECK-CORTEX-FIX-NEXT: vmov s0, r3
; CHECK-CORTEX-FIX-NEXT: vmov s2, r2
-; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
; CHECK-CORTEX-FIX-NEXT: vmov s8, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r5
-; CHECK-CORTEX-FIX-NEXT: vmov s11, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s13, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s13
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB83_4
-; CHECK-CORTEX-FIX-NEXT: .LBB83_3:
+; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2
+; CHECK-CORTEX-FIX-NEXT: .LBB83_4:
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[1]
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0
-; CHECK-CORTEX-FIX-NEXT: .LBB83_4:
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
-; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
-; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r2
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r2
-; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r3
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4
-; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s1
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r0, d2[0]
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s18
+; CHECK-CORTEX-FIX-NEXT: .LBB83_5:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s12
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s6
@@ -4323,7 +4519,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s5
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s7
; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -4332,7 +4528,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc}
br i1 %0, label %5, label %11
diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index 4293b9261f975..e3f824c214ade 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -1110,29 +1110,28 @@ define i64 @gorc2b_i64(i64 %a) nounwind {
;
; RV32ZBP-LABEL: gorc2b_i64:
; RV32ZBP: # %bb.0:
-; RV32ZBP-NEXT: srli a2, a1, 2
-; RV32ZBP-NEXT: srli a3, a0, 2
-; RV32ZBP-NEXT: lui a4, 209715
-; RV32ZBP-NEXT: addi a4, a4, 819
-; RV32ZBP-NEXT: and a3, a3, a4
-; RV32ZBP-NEXT: or a3, a3, a0
-; RV32ZBP-NEXT: or a2, a2, a1
-; RV32ZBP-NEXT: orc2.n a1, a1
+; RV32ZBP-NEXT: srli a2, a0, 2
+; RV32ZBP-NEXT: srli a3, a1, 2
+; RV32ZBP-NEXT: or a3, a3, a1
+; RV32ZBP-NEXT: or a2, a2, a0
; RV32ZBP-NEXT: orc2.n a0, a0
+; RV32ZBP-NEXT: orc2.n a1, a1
; RV32ZBP-NEXT: slli a2, a2, 2
; RV32ZBP-NEXT: slli a3, a3, 2
-; RV32ZBP-NEXT: lui a5, 838861
-; RV32ZBP-NEXT: addi a5, a5, -820
-; RV32ZBP-NEXT: and a3, a3, a5
-; RV32ZBP-NEXT: and a2, a2, a5
+; RV32ZBP-NEXT: lui a4, 838861
+; RV32ZBP-NEXT: addi a4, a4, -820
+; RV32ZBP-NEXT: and a3, a3, a4
+; RV32ZBP-NEXT: and a2, a2, a4
+; RV32ZBP-NEXT: srli a4, a1, 2
; RV32ZBP-NEXT: srli a5, a0, 2
-; RV32ZBP-NEXT: srli a6, a1, 2
-; RV32ZBP-NEXT: and a6, a6, a4
-; RV32ZBP-NEXT: and a4, a5, a4
-; RV32ZBP-NEXT: or a0, a4, a0
-; RV32ZBP-NEXT: or a1, a6, a1
-; RV32ZBP-NEXT: or a1, a1, a2
-; RV32ZBP-NEXT: or a0, a0, a3
+; RV32ZBP-NEXT: lui a6, 209715
+; RV32ZBP-NEXT: addi a6, a6, 819
+; RV32ZBP-NEXT: and a5, a5, a6
+; RV32ZBP-NEXT: and a4, a4, a6
+; RV32ZBP-NEXT: or a1, a4, a1
+; RV32ZBP-NEXT: or a0, a5, a0
+; RV32ZBP-NEXT: or a0, a0, a2
+; RV32ZBP-NEXT: or a1, a1, a3
; RV32ZBP-NEXT: ret
%and1 = shl i64 %a, 2
%shl1 = and i64 %and1, -3689348814741910324
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index 120105cfd14c7..0eb057a3c5bd4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -6,55 +6,55 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: .vsave {d9}
-; CHECK-NEXT: vpush {d9}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov.i64 q0, #0xffffffff
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vand q3, q2, q0
-; CHECK-NEXT: vand q0, q1, q0
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov r3, lr, d0
+; CHECK-NEXT: vmov.i64 q2, #0xffffffff
; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vmov r4, r1, d6
-; CHECK-NEXT: vmov r0, r12, d7
-; CHECK-NEXT: vldrw.u32 q3, [r2]
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s14, s5
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov.f32 s0, s12
-; CHECK-NEXT: vmov.f32 s6, s13
-; CHECK-NEXT: adds r2, r5, r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: asr.w r6, r5, #31
-; CHECK-NEXT: adcs r1, r6
-; CHECK-NEXT: asrl r2, r1, r4
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: adds r6, r1, r3
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: asr.w r4, r1, #31
-; CHECK-NEXT: adc.w r1, r4, lr
-; CHECK-NEXT: asrl r6, r1, r3
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
-; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: vmov.f32 s6, s5
+; CHECK-NEXT: vmov r4, r5, d0
+; CHECK-NEXT: vmov r3, r1, d1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vand q3, q1, q2
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vmov lr, r12, d7
+; CHECK-NEXT: vmov.f32 s16, s6
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vand q2, q4, q2
+; CHECK-NEXT: asrs r2, r0, #31
+; CHECK-NEXT: adds r0, r0, r4
+; CHECK-NEXT: adcs r5, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: asrl r0, r5, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: asrs r4, r2, #31
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adcs r1, r4
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: asrl r2, r1, r3
+; CHECK-NEXT: vmov r4, r5, d6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: adds.w r6, r1, lr
; CHECK-NEXT: asr.w r3, r1, #31
; CHECK-NEXT: adc.w r1, r3, r12
-; CHECK-NEXT: vmov r3, s18
-; CHECK-NEXT: asrl r0, r1, r3
-; CHECK-NEXT: vmov r1, s14
-; CHECK-NEXT: adds r6, r1, r5
-; CHECK-NEXT: asr.w r2, r1, #31
-; CHECK-NEXT: adc.w r1, r2, r4
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: asrl r6, r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r6, r0
-; CHECK-NEXT: vpop {d9}
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: asrl r6, r1, r3
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r4, r4, r1
+; CHECK-NEXT: asr.w r3, r1, #31
+; CHECK-NEXT: adc.w r1, r3, r5
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: asrl r4, r1, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, <4 x i32> *%A, align 4
@@ -142,56 +142,56 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d9}
-; CHECK-NEXT: vpush {d9}
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov.i64 q0, #0xffffffff
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vand q3, q2, q0
-; CHECK-NEXT: vand q1, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r4, lr, d2
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vmov.i64 q4, #0xffffffff
; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r5, r1, d6
-; CHECK-NEXT: vmov r0, r12, d7
-; CHECK-NEXT: vldrw.u32 q3, [r2]
-; CHECK-NEXT: vmov.f32 s10, s3
-; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s14, s1
-; CHECK-NEXT: vmov r6, s4
-; CHECK-NEXT: vmov.f32 s4, s12
-; CHECK-NEXT: vmov.f32 s2, s13
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: vmov.f32 s6, s3
+; CHECK-NEXT: vand q2, q0, q4
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vand q1, q1, q4
+; CHECK-NEXT: vmov r5, r1, d3
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov r0, r12, d2
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vmov r4, lr, d5
+; CHECK-NEXT: vmov.f32 s20, s6
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s22, s7
+; CHECK-NEXT: vand q4, q5, q4
+; CHECK-NEXT: vmov r6, s2
+; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: adds r2, r6, r5
-; CHECK-NEXT: vmov r5, s8
+; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: asr.w r7, r6, #31
; CHECK-NEXT: adcs r1, r7
; CHECK-NEXT: asrl r2, r1, r5
-; CHECK-NEXT: vmov r7, s4
-; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r7, s2
+; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: asr.w r5, r1, #31
; CHECK-NEXT: adc.w r1, r5, lr
; CHECK-NEXT: asrl r4, r1, r7
-; CHECK-NEXT: vmov r6, r5, d3
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r2
+; CHECK-NEXT: vmov r6, r5, d4
+; CHECK-NEXT: vmov r1, s12
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: asr.w r7, r1, #31
; CHECK-NEXT: adc.w r1, r7, r12
-; CHECK-NEXT: vmov r7, s18
+; CHECK-NEXT: vmov r7, s16
; CHECK-NEXT: asrl r0, r1, r7
-; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: adds r6, r6, r1
-; CHECK-NEXT: asr.w r2, r1, #31
-; CHECK-NEXT: adc.w r1, r2, r5
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: asrl r6, r1, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r6, r0
-; CHECK-NEXT: vstrw.32 q1, [r3]
-; CHECK-NEXT: vpop {d9}
+; CHECK-NEXT: asr.w r7, r1, #31
+; CHECK-NEXT: adc.w r1, r7, r5
+; CHECK-NEXT: vmov r7, s4
+; CHECK-NEXT: asrl r6, r1, r7
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r2
+; CHECK-NEXT: vstrw.32 q0, [r3]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
@@ -276,8 +276,8 @@ entry:
define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s2, s3
@@ -285,27 +285,27 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: adds.w r12, r2, r2
; CHECK-NEXT: asr.w r3, r2, #31
-; CHECK-NEXT: adc.w r7, r3, r2, asr #31
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: asrl r12, r7, r2
-; CHECK-NEXT: adds r0, r3, r3
-; CHECK-NEXT: asr.w r5, r3, #31
-; CHECK-NEXT: adc.w r5, r5, r3, asr #31
-; CHECK-NEXT: asrl r0, r5, r3
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: adds r4, r3, r3
-; CHECK-NEXT: asr.w r5, r3, #31
-; CHECK-NEXT: adc.w r5, r5, r3, asr #31
-; CHECK-NEXT: asrl r4, r5, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r0
-; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: adc.w r3, r3, r2, asr #31
+; CHECK-NEXT: asrl r12, r3, r2
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: adds r2, r3, r3
+; CHECK-NEXT: asr.w r0, r3, #31
+; CHECK-NEXT: adc.w r5, r0, r3, asr #31
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: asrl r2, r5, r3
; CHECK-NEXT: adds r4, r0, r0
-; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adc.w r3, r2, r0, asr #31
+; CHECK-NEXT: asr.w r3, r0, #31
+; CHECK-NEXT: adc.w r3, r3, r0, asr #31
; CHECK-NEXT: asrl r4, r3, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r4, r12
-; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: adds r6, r0, r0
+; CHECK-NEXT: asr.w r3, r0, #31
+; CHECK-NEXT: adc.w r3, r3, r0, asr #31
+; CHECK-NEXT: asrl r6, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT: vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, <4 x i32> *%A, align 4
%sa = sext <4 x i32> %a to <4 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index f412204993ea2..879351b07b425 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -180,44 +180,44 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.i64 q2, #0xffffffff
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vmov r2, r3, d2
+; CHECK-NEXT: vmov r3, r7, d2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r12, lr, d7
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: adds r0, r0, r4
-; CHECK-NEXT: asr.w r5, r4, #31
+; CHECK-NEXT: vmov lr, r12, d7
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: asrs r5, r2, #31
+; CHECK-NEXT: adds r2, r2, r0
+; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adcs r1, r5
+; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: lsrl r2, r1, #1
+; CHECK-NEXT: asrs r1, r0, #31
+; CHECK-NEXT: adds.w r0, r0, lr
+; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: asrs r4, r5, #31
+; CHECK-NEXT: adds r6, r5, r3
+; CHECK-NEXT: vmov r3, r5, d3
+; CHECK-NEXT: vmov.f32 s6, s1
; CHECK-NEXT: lsrl r0, r1, #1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: adds r2, r2, r1
-; CHECK-NEXT: asr.w r4, r1, #31
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: lsrl r2, r3, #1
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: adds.w r4, r3, r12
-; CHECK-NEXT: asr.w r6, r3, #31
-; CHECK-NEXT: adc.w r3, r6, lr
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adcs r7, r4
+; CHECK-NEXT: lsrl r6, r7, #1
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: adds r6, r1, r3
+; CHECK-NEXT: asr.w r2, r1, #31
; CHECK-NEXT: adc.w r1, r2, r5
-; CHECK-NEXT: lsrl r4, r3, #1
-; CHECK-NEXT: lsrl r0, r1, #1
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r4
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: lsrl r6, r1, #1
+; CHECK-NEXT: vmov q0[3], q0[1], r6, r0
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
@@ -328,107 +328,98 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_ops_trunc_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s16, s2
-; CHECK-NEXT: vmov.i64 q3, #0xffffffff
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vand q2, q2, q3
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r1, r7, d4
-; CHECK-NEXT: vand q1, q1, q3
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s16
+; CHECK-NEXT: vmov r10, s8
+; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: adds r0, r3, r1
-; CHECK-NEXT: asr.w r5, r3, #31
-; CHECK-NEXT: adcs r5, r7
-; CHECK-NEXT: asrl r0, r5, r1
-; CHECK-NEXT: subs.w lr, r0, r1
-; CHECK-NEXT: asr.w r0, r6, #31
-; CHECK-NEXT: sbc.w r8, r5, r7
-; CHECK-NEXT: adds r4, r6, r2
-; CHECK-NEXT: adc.w r5, r0, r12
-; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: asrl r4, r5, r2
+; CHECK-NEXT: vmov.f32 s6, s5
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: asr.w r0, r10, #31
+; CHECK-NEXT: asrs r7, r6, #31
+; CHECK-NEXT: adds.w r4, r10, r2
+; CHECK-NEXT: adc r3, r0, #0
+; CHECK-NEXT: asrl r4, r3, r2
; CHECK-NEXT: subs r0, r4, r2
-; CHECK-NEXT: sbc.w r5, r5, r12
-; CHECK-NEXT: mov.w r12, #0
-; CHECK-NEXT: umull r0, r4, r0, r2
-; CHECK-NEXT: mla r5, r5, r2, r4
-; CHECK-NEXT: eor.w r4, r3, r1
-; CHECK-NEXT: orr.w r4, r4, r3, asr #31
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: csetm r4, eq
-; CHECK-NEXT: bfi r7, r4, #0, #8
-; CHECK-NEXT: eor.w r4, r6, r2
-; CHECK-NEXT: orr.w r4, r4, r6, asr #31
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: lsll r0, r5, r6
-; CHECK-NEXT: csetm r4, eq
-; CHECK-NEXT: lsll r0, r5, r2
-; CHECK-NEXT: bfi r7, r4, #8, #8
-; CHECK-NEXT: rsbs r2, r3, #0
+; CHECK-NEXT: sbc lr, r3, #0
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: umull r0, r8, r0, r2
+; CHECK-NEXT: adds r4, r6, r3
+; CHECK-NEXT: eor.w r1, r6, r3
+; CHECK-NEXT: adc r5, r7, #0
+; CHECK-NEXT: eor.w r7, r10, r2
+; CHECK-NEXT: asrl r4, r5, r3
+; CHECK-NEXT: orr.w r7, r7, r10, asr #31
+; CHECK-NEXT: subs r4, r4, r3
+; CHECK-NEXT: orr.w r1, r1, r6, asr #31
+; CHECK-NEXT: sbc r5, r5, #0
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: umull r4, r12, r4, r3
+; CHECK-NEXT: csetm r9, eq
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: bfi r7, r9, #0, #8
+; CHECK-NEXT: csetm r1, eq
+; CHECK-NEXT: bfi r7, r1, #8, #8
+; CHECK-NEXT: mla r5, r5, r3, r12
+; CHECK-NEXT: rsbs r1, r6, #0
; CHECK-NEXT: vmsr p0, r7
-; CHECK-NEXT: umull r4, r7, lr, r1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: mla r7, r8, r1, r7
-; CHECK-NEXT: lsll r4, r7, r2
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: lsll r4, r7, r1
-; CHECK-NEXT: vmov r1, r7, d2
-; CHECK-NEXT: vmov q4[2], q4[0], r4, r0
-; CHECK-NEXT: vpsel q2, q4, q2
-; CHECK-NEXT: asrs r0, r3, #31
-; CHECK-NEXT: adds r4, r3, r1
-; CHECK-NEXT: adc.w r5, r0, r7
-; CHECK-NEXT: asrl r4, r5, r1
-; CHECK-NEXT: subs r0, r4, r1
-; CHECK-NEXT: sbc.w r7, r5, r7
-; CHECK-NEXT: umull r0, r4, r0, r1
-; CHECK-NEXT: mla r9, r7, r1, r4
-; CHECK-NEXT: vmov r7, s2
-; CHECK-NEXT: adds r6, r7, r2
-; CHECK-NEXT: asr.w r4, r7, #31
-; CHECK-NEXT: adc.w r5, r4, lr
-; CHECK-NEXT: asrl r6, r5, r2
-; CHECK-NEXT: subs r4, r6, r2
-; CHECK-NEXT: sbc.w r6, r5, lr
-; CHECK-NEXT: eor.w r5, r3, r1
-; CHECK-NEXT: orr.w r5, r5, r3, asr #31
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: lsll r0, r9, r3
-; CHECK-NEXT: csetm r5, eq
-; CHECK-NEXT: rsbs r3, r7, #0
-; CHECK-NEXT: bfi r12, r5, #0, #8
-; CHECK-NEXT: eor.w r5, r7, r2
-; CHECK-NEXT: orr.w r5, r5, r7, asr #31
-; CHECK-NEXT: lsll r0, r9, r1
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: csetm r5, eq
-; CHECK-NEXT: bfi r12, r5, #8, #8
-; CHECK-NEXT: umull r4, r5, r4, r2
-; CHECK-NEXT: vmsr p0, r12
-; CHECK-NEXT: mla r5, r6, r2, r5
+; CHECK-NEXT: mla r7, lr, r2, r8
+; CHECK-NEXT: lsll r4, r5, r1
+; CHECK-NEXT: rsb.w r1, r10, #0
+; CHECK-NEXT: lsll r0, r7, r1
+; CHECK-NEXT: vmov lr, s2
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: lsll r0, r7, r2
; CHECK-NEXT: lsll r4, r5, r3
-; CHECK-NEXT: lsll r4, r5, r2
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov q3[2], q3[0], r0, r4
+; CHECK-NEXT: vpsel q2, q3, q2
+; CHECK-NEXT: adds.w r2, lr, r1
+; CHECK-NEXT: asr.w r0, lr, #31
+; CHECK-NEXT: adc r3, r0, #0
+; CHECK-NEXT: asrl r2, r3, r1
+; CHECK-NEXT: subs r0, r2, r1
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: sbc r7, r3, #0
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: umull r0, r6, r0, r1
+; CHECK-NEXT: asrs r5, r2, #31
+; CHECK-NEXT: adds r4, r2, r3
+; CHECK-NEXT: adc r5, r5, #0
+; CHECK-NEXT: asrl r4, r5, r3
+; CHECK-NEXT: subs r4, r4, r3
+; CHECK-NEXT: sbc r8, r5, #0
+; CHECK-NEXT: mla r5, r7, r1, r6
+; CHECK-NEXT: eor.w r6, lr, r1
+; CHECK-NEXT: orr.w r6, r6, lr, asr #31
+; CHECK-NEXT: eor.w r7, r2, r3
+; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: orr.w r7, r7, r2, asr #31
+; CHECK-NEXT: csetm r6, eq
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: csetm r7, eq
+; CHECK-NEXT: rsb.w lr, lr, #0
+; CHECK-NEXT: bfi r12, r7, #0, #8
+; CHECK-NEXT: lsll r0, r5, lr
+; CHECK-NEXT: bfi r12, r6, #8, #8
+; CHECK-NEXT: umull r4, r6, r4, r3
+; CHECK-NEXT: lsll r0, r5, r1
+; CHECK-NEXT: rsbs r1, r2, #0
+; CHECK-NEXT: vmsr p0, r12
+; CHECK-NEXT: mla r7, r8, r3, r6
+; CHECK-NEXT: lsll r4, r7, r1
+; CHECK-NEXT: lsll r4, r7, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s10
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 7a62d6d148167..9c283fb6298ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -57,19 +57,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vpt.s32 lt, q0, zr
; CHECK-NEXT: vldrwt.u32 q5, [r0]
-; CHECK-NEXT: vmov.f32 s2, s21
+; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vmov.f32 s16, s22
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov d9, r0, r1
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov d8, r0, r1
-; CHECK-NEXT: vmov.f32 s20, s22
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
@@ -82,8 +82,8 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov d10, r0, r1
-; CHECK-NEXT: vmov q0, q4
-; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: vmov q0, q5
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index ac4c6566ee414..9819a8253f345 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -401,26 +401,26 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: subs r4, r4, r6
; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
; CHECK-NEXT: vmov r6, s8
+; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: subs r5, r7, r6
+; CHECK-NEXT: asr.w r7, r7, #31
; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
-; CHECK-NEXT: asr.w r5, r7, #31
-; CHECK-NEXT: sbc.w r5, r5, r6, asr #31
-; CHECK-NEXT: vmov r6, s14
-; CHECK-NEXT: vmov r7, s6
-; CHECK-NEXT: subs r3, r7, r6
-; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
-; CHECK-NEXT: asr.w r3, r5, #31
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: bfi r4, r3, #0, #4
-; CHECK-NEXT: asr.w r3, r9, #31
-; CHECK-NEXT: bfi r4, r3, #4, #4
-; CHECK-NEXT: asr.w r3, r12, #31
-; CHECK-NEXT: bfi r4, r3, #8, #4
-; CHECK-NEXT: asr.w r3, r7, #31
-; CHECK-NEXT: sbc.w r3, r3, r6, asr #31
+; CHECK-NEXT: vmov r5, s14
+; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
+; CHECK-NEXT: asrs r6, r6, #31
+; CHECK-NEXT: subs r7, r3, r5
+; CHECK-NEXT: asr.w r3, r3, #31
+; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
+; CHECK-NEXT: mov.w r7, #0
+; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
+; CHECK-NEXT: bfi r7, r6, #0, #4
+; CHECK-NEXT: asr.w r4, r9, #31
+; CHECK-NEXT: asr.w r6, r12, #31
+; CHECK-NEXT: bfi r7, r4, #4, #4
; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: bfi r4, r3, #12, #4
-; CHECK-NEXT: vmsr p0, r4
+; CHECK-NEXT: bfi r7, r6, #8, #4
+; CHECK-NEXT: bfi r7, r3, #12, #4
+; CHECK-NEXT: vmsr p0, r7
; CHECK-NEXT: vpst
; CHECK-NEXT: vsubt.i32 q2, q0, q2
; CHECK-NEXT: vstrb.8 q2, [r2], #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index f66eb8584a0bd..217caeebe6335 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -232,34 +232,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: umull lr, r12, r1, r0
+; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r2, r5, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT: umull lr, r12, r1, r0
+; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
; CHECK-NEXT: asrs r2, r0, #31
; CHECK-NEXT: mla r4, r1, r2, r12
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: mla r5, r3, r2, r5
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: mla r3, r3, r0, r5
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r5, lr, r4, r0
-; CHECK-NEXT: umull r3, r12, r1, r0
-; CHECK-NEXT: vmov q1[2], q1[0], r5, r3
-; CHECK-NEXT: mla r3, r1, r2, r12
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: umull r3, r5, r1, r0
+; CHECK-NEXT: mla r5, r1, r2, r5
; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r2, r4, r2, lr
-; CHECK-NEXT: mla r1, r1, r0, r3
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: mla r0, r3, r0, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: mla r12, r1, r0, r5
+; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: umull r4, r1, r5, r0
+; CHECK-NEXT: mla r1, r5, r2, r1
+; CHECK-NEXT: asrs r2, r5, #31
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT: mla r0, r2, r0, r1
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -276,34 +275,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: asrs r4, r0, #31
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: umull lr, r12, r0, r1
+; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r2, r5, r0, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT: umull lr, r12, r0, r1
+; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: mla r2, r0, r2, r12
; CHECK-NEXT: mla r1, r4, r1, r2
; CHECK-NEXT: asrs r2, r3, #31
; CHECK-NEXT: mla r2, r0, r2, r5
-; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: mla r2, r4, r3, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r3, lr, r0, r5
-; CHECK-NEXT: umull r2, r12, r0, r1
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r5, #31
-; CHECK-NEXT: mla r0, r0, r2, lr
-; CHECK-NEXT: mla r0, r4, r5, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: umull r2, r3, r0, r1
+; CHECK-NEXT: asrs r5, r1, #31
+; CHECK-NEXT: mla r3, r0, r5, r3
+; CHECK-NEXT: mla r12, r4, r1, r3
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: umull r5, r1, r0, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r0, r0, r2, r1
+; CHECK-NEXT: mla r0, r4, r3, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 4b28c2b07cacc..8bc247d9ebaf3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,21 +8,18 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldrd r12, r3, [r0]
+; CHECK-NEXT: ldrd lr, r2, [r0, #8]
; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT: vmov.32 q0[0], r4
-; CHECK-NEXT: vmov.f32 s8, s7
-; CHECK-NEXT: vmov.32 q0[1], r0
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: vmov q1[2], q1[0], r12, lr
+; CHECK-NEXT: strd r2, r0, [r1, #16]
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vmov.f32 s10, s0
; CHECK-NEXT: vmov.f32 s11, s5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: pop {r4, pc}
entry:
%s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 987fa7732e424..2f6576f29d0ac 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -349,19 +349,17 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
; X64-LABEL: test_bitreverse_shli_bitreverse_i64:
; X64: # %bb.0:
; X64-NEXT: bswapq %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $4, %rax
-; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rcx, %rax
-; X64-NEXT: andq %rcx, %rdi
-; X64-NEXT: shlq $4, %rdi
-; X64-NEXT: orq %rax, %rdi
-; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: andq %rax, %rcx
-; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: leaq (%rdi,%rcx,4), %rax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT: shll $4, %eax
+; X64-NEXT: shrl $4, %edi
+; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
+; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555
; X64-NEXT: shrl %eax
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index c7844698f8693..ec73b64b993b9 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -50,55 +50,59 @@ define i96 @square_high(i96 %x) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: setb %al
+; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %al
-; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: addb $255, %cl
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %eax
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: setb %ah
+; X86-NEXT: addb $255, %al
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: movzbl %ah, %ebx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %eax
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: addl $8, %esp
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index ddd6f002a0992..f9b3a19889e5f 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -556,18 +556,16 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movl $1, %eax
-; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -583,18 +581,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movl $1, %eax
-; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
@@ -654,18 +650,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movl $1, %eax
-; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -681,18 +675,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movl $1, %eax
-; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 6dfc15ed38e75..cb5d3b0ac21c0 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -117,7 +117,7 @@ define void @i56_or(ptr %a) {
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
; X64-NEXT: movb %cl, 6(%rdi)
-; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
@@ -149,7 +149,7 @@ define void @i56_and_or(ptr %a) {
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
; X64-NEXT: movb %cl, 6(%rdi)
-; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
@@ -187,19 +187,18 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; X64-NEXT: movzwl 4(%rdi), %ecx
; X64-NEXT: movzbl 6(%rdi), %edx
; X64-NEXT: movb %dl, 6(%rdi)
-; X64-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
; X64-NEXT: shll $16, %edx
; X64-NEXT: orl %ecx, %edx
; X64-NEXT: shlq $32, %rdx
; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: orq %rdx, %rcx
; X64-NEXT: shlq $13, %rax
-; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; X64-NEXT: andq %rcx, %rdx
-; X64-NEXT: orq %rax, %rdx
-; X64-NEXT: movl %edx, (%rdi)
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: movw %dx, 4(%rdi)
+; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: movw %cx, 4(%rdi)
; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, ptr %a, align 1
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index ecbdc2d91deae..8199c68616a9f 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,16 +191,17 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $188, %esp
+; X86-NEXT: subl $184, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: negl %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: andl $1, %ebp
-; X86-NEXT: negl %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -208,8 +209,9 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %edx, %ecx
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %ecx
@@ -229,18 +231,18 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %edx, %edi
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %edi
@@ -269,139 +271,140 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %cl
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -447,113 +450,118 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl %esi, %eax
; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: addl %ebp, %edi
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ebx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movzbl %al, %esi
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %edi
; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %eax, %edi
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: addl %edi, %edx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: adcl $0, %eax
; X86-NEXT: addl %esi, %edx
; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: setb %bl
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
@@ -566,55 +574,54 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: setb %bl
+; X86-NEXT: setb %al
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movzbl %al, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl $0, %ebp
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull %eax, %edx
@@ -628,36 +635,38 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull %eax, %edx
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: imull %ebx
; X86-NEXT: addl %eax, %eax
; X86-NEXT: adcl %edx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %edi
@@ -679,127 +688,127 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl %ecx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl %ebp, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: adcl %edx, %edi
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: imull %edx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: imull %edx, %ecx
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: imull %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %edx, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: addl %eax, %ebx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: addl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: orl %eax, %ebp
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: orl %eax, %ebx
; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: xorl %ecx, %edx
; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %edx
; X86-NEXT: andl $1, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: negl %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: xorl %eax, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: orl %esi, %ebx
@@ -817,7 +826,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: movb %dl, 16(%eax)
; X86-NEXT: setne 20(%eax)
-; X86-NEXT: addl $188, %esp
+; X86-NEXT: addl $184, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
index a4d2b7295af62..7c1efa7c8b48a 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
@@ -12,14 +12,13 @@ declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
define i64 @func() nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
-; X64-NEXT: movl $2, %ecx
-; X64-NEXT: movl $3, %eax
-; X64-NEXT: imulq %rcx
-; X64-NEXT: cmpq $2, %rdx
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $1, %ecx
; X64-NEXT: cmovgeq %rax, %rcx
-; X64-NEXT: cmpq $-2, %rdx
+; X64-NEXT: movq $-2, %rax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
@@ -42,16 +41,15 @@ define i64 @func2() nounwind {
define i64 @func3() nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
-; X64-NEXT: cmpq $2, %rdx
-; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT: cmovgeq %rcx, %rsi
-; X64-NEXT: cmpq $-2, %rdx
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF
+; X64-NEXT: cmovgeq %rax, %rcx
+; X64-NEXT: movq $-2, %rax
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2)
ret i64 %tmp
@@ -60,16 +58,15 @@ define i64 @func3() nounwind {
define i64 @func4() nounwind {
; X64-LABEL: func4:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
-; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
-; X64-NEXT: movl $4294967295, %esi # imm = 0xFFFFFFFF
-; X64-NEXT: cmovgq %rcx, %rsi
-; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000
+; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmovgq %rax, %rcx
+; X64-NEXT: movq $-2147483648, %rax # imm = 0x80000000
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32)
ret i64 %tmp
@@ -78,18 +75,15 @@ define i64 @func4() nounwind {
define i64 @func5() nounwind {
; X64-LABEL: func5:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdx
; X64-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT: cmpq %rax, %rdx
-; X64-NEXT: movl $1, %esi
-; X64-NEXT: cmovgq %rcx, %rsi
+; X64-NEXT: negq %rax
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movl $1, %ecx
+; X64-NEXT: cmovgq %rax, %rcx
; X64-NEXT: movabsq $-4611686018427387904, %rax # imm = 0xC000000000000000
-; X64-NEXT: cmpq %rax, %rdx
+; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT: cmovgeq %rsi, %rax
+; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index a80931bfaa836..a5ab87f744cde 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -558,12 +558,11 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -572,7 +571,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
@@ -648,19 +647,18 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1135,7 +1133,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -1143,19 +1141,18 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
@@ -1379,12 +1376,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1393,7 +1389,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index a9be2a5b9273e..e3477585f48bb 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -163,19 +163,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -241,19 +240,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -479,21 +477,20 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
@@ -559,19 +556,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -926,21 +922,20 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
@@ -1006,19 +1001,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1167,21 +1161,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
@@ -1842,21 +1835,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
@@ -1921,21 +1913,20 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: