[llvm] 529bd4f - [DAG] SimplifyDemandedBits - don't early-out for multiple use values

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 27 02:54:24 PDT 2022


Author: Simon Pilgrim
Date: 2022-07-27T10:54:06+01:00
New Revision: 529bd4f3525d7ff564012597df204d8a8a4fb9ac

URL: https://github.com/llvm/llvm-project/commit/529bd4f3525d7ff564012597df204d8a8a4fb9ac
DIFF: https://github.com/llvm/llvm-project/commit/529bd4f3525d7ff564012597df204d8a8a4fb9ac.diff

LOG: [DAG] SimplifyDemandedBits - don't early-out for multiple use values

SimplifyDemandedBits currently early-outs for multi-use values beyond the root node (just returning the knownbits), which misses a number of optimizations, as there are plenty of cases where we can still simplify when initially demanding all elements/bits.
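
A minimal standalone sketch of the idea (a toy demanded-bits walker with
invented names, Node and simplifyDemandedBitsToy, not the actual LLVM
implementation): instead of returning as soon as a value has more than
one use, the demand is widened to all bits and simplification continues,
subject only to the usual recursion-depth limit.

    #include <cstdint>

    // Toy stand-in for an SDNode (illustration only, not LLVM code).
    struct Node {
      unsigned NumUses = 1;
      Node *Operand = nullptr;
    };

    bool simplifyDemandedBitsToy(Node &N, uint64_t DemandedBits,
                                 unsigned Depth) {
      // Stand-in for SelectionDAG::MaxRecursionDepth.
      const unsigned MaxDepth = 6;
      if (N.NumUses > 1) {
        // Old behaviour: bail out here for Depth != 0, returning only the
        // known bits. New behaviour: keep going with a widened demand.
        if (Depth >= MaxDepth)
          return false;          // limit search depth
        DemandedBits = ~0ULL;    // demand all bits, but keep simplifying
      }
      // ... fold constants, recurse into operands with DemandedBits ...
      if (N.Operand)
        return simplifyDemandedBitsToy(*N.Operand, DemandedBits, Depth + 1);
      return false;
    }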

@lenary has confirmed that the test cases in aes-erratum-fix.ll need refactoring and that the current codegen increase is not a major concern.

Differential Revision: https://reviews.llvm.org/D129765

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
    llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
    llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    llvm/test/CodeGen/ARM/aes-erratum-fix.ll
    llvm/test/CodeGen/RISCV/rv32zbp.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
    llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/X86/combine-bitreverse.ll
    llvm/test/CodeGen/X86/dagcombine-cse.ll
    llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
    llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
    llvm/test/CodeGen/X86/smul-with-overflow.ll
    llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
    llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6205e74837c04..102c412fe72fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1089,6 +1089,10 @@ bool TargetLowering::SimplifyDemandedBits(
   if (Op.isUndef())
     return false;
 
+  // We can't simplify target constants.
+  if (Op.getOpcode() == ISD::TargetConstant)
+    return false;
+
   if (Op.getOpcode() == ISD::Constant) {
     // We know all of the bits for a constant!
     Known = KnownBits::makeConstant(cast<ConstantSDNode>(Op)->getAPIntValue());
@@ -1103,17 +1107,16 @@ bool TargetLowering::SimplifyDemandedBits(
   }
 
   // Other users may use these bits.
+  bool HasMultiUse = false;
   if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
-    if (Depth != 0) {
-      // If not at the root, Just compute the Known bits to
-      // simplify things downstream.
-      Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+    if (Depth >= SelectionDAG::MaxRecursionDepth) {
+      // Limit search depth.
       return false;
     }
-    // If this is the root being simplified, allow it to have multiple uses,
-    // just set the DemandedBits/Elts to all bits.
+    // Allow multiple uses, just set the DemandedBits/Elts to all bits.
     DemandedBits = APInt::getAllOnes(BitWidth);
     DemandedElts = APInt::getAllOnes(NumElts);
+    HasMultiUse = true;
   } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
     // Not demanding any bits/elts from Op.
     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -1124,8 +1127,6 @@ bool TargetLowering::SimplifyDemandedBits(
 
   KnownBits Known2;
   switch (Op.getOpcode()) {
-  case ISD::TargetConstant:
-    llvm_unreachable("Can't simplify this node");
   case ISD::SCALAR_TO_VECTOR: {
     if (!DemandedElts[0])
       return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -2715,6 +2716,12 @@ bool TargetLowering::SimplifyDemandedBits(
               APFloat(TLO.DAG.EVTToAPFloatSemantics(VT), Known.One), dl, VT));
   }
 
+  // A multi use 'all demanded elts' simplify failed to find any knownbits.
+  // Try again just for the original demanded elts.
+  // Ensure we do this AFTER constant folding above.
+  if (HasMultiUse && Known.isUnknown() && !OriginalDemandedElts.isAllOnes())
+    Known = TLO.DAG.computeKnownBits(Op, OriginalDemandedElts, Depth);
+
   return false;
 }
 

diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 6e30267162b96..427b1fed1f307 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -2616,36 +2616,36 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    mov w8, #1895825407
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x25, #-34359738368
-; CHECK-NEXT:    mov x23, #34359738367
+; CHECK-NEXT:    mov x22, #34359738367
 ; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x23, x9, gt
-; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    csinv x9, x9, xzr, le
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT:    csel x8, xzr, x9, vs
 ; CHECK-NEXT:    str x8, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT:    csel x8, xzr, x9, vs
+; CHECK-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x23, x9, gt
+; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    csel x22, xzr, x9, vs
+; CHECK-NEXT:    csel x10, xzr, x8, vs
+; CHECK-NEXT:    csel x8, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x8, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x8, x10, [sp, #8] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
@@ -2654,10 +2654,10 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x23, x8, gt
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x24, xzr, x8, vs
+; CHECK-NEXT:    csel x26, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
@@ -2669,40 +2669,39 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x23, x8, gt
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x27, xzr, x8, vs
+; CHECK-NEXT:    csel x28, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x23, x9, gt
-; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    csinv x9, x9, xzr, le
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    csel x29, xzr, x9, vs
+; CHECK-NEXT:    csel x27, xzr, x8, vs
+; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x23, x9, gt
+; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x20, xzr, x8, vs
-; CHECK-NEXT:    csel x28, xzr, x9, vs
+; CHECK-NEXT:    csel x29, xzr, x8, vs
+; CHECK-NEXT:    csel x21, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -2712,65 +2711,54 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x23, x8, gt
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x21, xzr, x8, vs
-; CHECK-NEXT:    csel x26, xzr, x9, vs
+; CHECK-NEXT:    csel x23, xzr, x8, vs
+; CHECK-NEXT:    csel x24, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    fmov d0, x20
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    ldr x11, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    lsr x10, x28, #28
-; CHECK-NEXT:    ldr d1, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    lsr x12, x29, #28
-; CHECK-NEXT:    mov v0.d[1], x28
+; CHECK-NEXT:    extr x9, x21, x29, #28
+; CHECK-NEXT:    bfi x23, x20, #36, #28
+; CHECK-NEXT:    extr x11, x27, x20, #28
+; CHECK-NEXT:    str x24, [x19]
 ; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x10, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    stur x11, [x19, #75]
-; CHECK-NEXT:    ldr x13, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x23, x8, gt
+; CHECK-NEXT:    stur x9, [x19, #41]
+; CHECK-NEXT:    stp x23, x11, [x19, #8]
+; CHECK-NEXT:    lsr x11, x27, #28
+; CHECK-NEXT:    csinv x9, x10, xzr, le
+; CHECK-NEXT:    lsr x10, x21, #28
+; CHECK-NEXT:    csel x8, x22, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    stur x13, [x19, #50]
-; CHECK-NEXT:    mov v1.d[1], x29
-; CHECK-NEXT:    ldr d0, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    strb w10, [x19, #49]
-; CHECK-NEXT:    extr x10, x28, x11, #28
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    bfi x8, x11, #36, #28
-; CHECK-NEXT:    strb w12, [x19, #24]
+; CHECK-NEXT:    ldr x10, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    csel x9, xzr, x9, vs
+; CHECK-NEXT:    bfi x8, x29, #36, #28
+; CHECK-NEXT:    strb w11, [x19, #24]
+; CHECK-NEXT:    stur x10, [x19, #75]
+; CHECK-NEXT:    ldp x12, x11, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    stur x9, [x19, #25]
-; CHECK-NEXT:    fmov x12, d1
-; CHECK-NEXT:    stur x10, [x19, #41]
-; CHECK-NEXT:    lsr x9, x22, #28
-; CHECK-NEXT:    ldr d1, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    stur x8, [x19, #33]
+; CHECK-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    extr x10, x12, x11, #28
+; CHECK-NEXT:    bfi x28, x11, #36, #28
+; CHECK-NEXT:    stur x8, [x19, #50]
+; CHECK-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldr x11, [sp, #72] // 8-byte Folded Reload
-; CHECK-NEXT:    extr x18, x29, x12, #28
-; CHECK-NEXT:    mov v0.d[1], x22
-; CHECK-NEXT:    bfi x21, x12, #36, #28
-; CHECK-NEXT:    str x26, [x19]
-; CHECK-NEXT:    mov v1.d[1], x11
-; CHECK-NEXT:    lsr x10, x11, #28
-; CHECK-NEXT:    mov x13, x11
-; CHECK-NEXT:    stp x21, x18, [x19, #8]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    strb w9, [x19, #99]
-; CHECK-NEXT:    strb w10, [x19, #74]
-; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    extr x12, x22, x8, #28
-; CHECK-NEXT:    bfi x27, x8, #36, #28
-; CHECK-NEXT:    extr x8, x13, x11, #28
-; CHECK-NEXT:    bfi x24, x11, #36, #28
-; CHECK-NEXT:    stur x12, [x19, #91]
-; CHECK-NEXT:    stur x27, [x19, #83]
+; CHECK-NEXT:    stur x10, [x19, #91]
+; CHECK-NEXT:    stur x28, [x19, #83]
+; CHECK-NEXT:    extr x8, x11, x9, #28
+; CHECK-NEXT:    bfi x26, x9, #36, #28
+; CHECK-NEXT:    lsr x9, x12, #28
 ; CHECK-NEXT:    stur x8, [x19, #66]
-; CHECK-NEXT:    stur x24, [x19, #58]
+; CHECK-NEXT:    lsr x8, x11, #28
+; CHECK-NEXT:    stur x26, [x19, #58]
+; CHECK-NEXT:    strb w9, [x19, #99]
+; CHECK-NEXT:    strb w8, [x19, #74]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #144] // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 35b78615aa7f0..e669ea5a26522 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2195,28 +2195,28 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1904214015
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov x21, #68719476735
+; CHECK-NEXT:    mov x23, #68719476735
 ; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x20, x21, x8, gt
+; CHECK-NEXT:    csel x9, x23, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x8, x9, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x23, x21, x8, gt
+; CHECK-NEXT:    csel x9, x23, x9, gt
+; CHECK-NEXT:    csinv x24, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
@@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x24, x21, x9, gt
+; CHECK-NEXT:    csel x25, x23, x9, gt
 ; CHECK-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
@@ -2238,29 +2238,29 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x26, x21, x9, gt
-; CHECK-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    csel x27, x23, x9, gt
+; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x29, x9, xzr, le
-; CHECK-NEXT:    csel x28, x21, x8, gt
+; CHECK-NEXT:    csel x29, x23, x9, gt
+; CHECK-NEXT:    csinv x26, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x27, x9, xzr, le
-; CHECK-NEXT:    csel x22, x21, x8, gt
+; CHECK-NEXT:    csel x28, x23, x9, gt
+; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -2270,58 +2270,46 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x25, x21, x9, gt
-; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
+; CHECK-NEXT:    csel x21, x23, x9, gt
+; CHECK-NEXT:    csinv x22, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr x11, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    fmov d0, x27
-; CHECK-NEXT:    fmov d1, x29
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    lsr x10, x22, #28
-; CHECK-NEXT:    stur x11, [x19, #75]
-; CHECK-NEXT:    lsr x11, x28, #28
-; CHECK-NEXT:    mov v0.d[1], x22
-; CHECK-NEXT:    ldr x12, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v1.d[1], x28
+; CHECK-NEXT:    extr x8, x28, x20, #28
+; CHECK-NEXT:    bfi x21, x26, #36, #28
+; CHECK-NEXT:    extr x9, x29, x26, #28
+; CHECK-NEXT:    lsr x11, x29, #28
+; CHECK-NEXT:    str x22, [x19]
+; CHECK-NEXT:    stur x8, [x19, #41]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    csel x10, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    stur x12, [x19, #50]
-; CHECK-NEXT:    fmov x12, d0
-; CHECK-NEXT:    fmov x13, d1
+; CHECK-NEXT:    stp x21, x9, [x19, #8]
+; CHECK-NEXT:    lsr x9, x28, #28
+; CHECK-NEXT:    strb w11, [x19, #24]
+; CHECK-NEXT:    bfi x27, x24, #36, #28
+; CHECK-NEXT:    csel x10, x23, x10, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    ldp d0, d1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, x21, x9, gt
-; CHECK-NEXT:    strb w10, [x19, #49]
-; CHECK-NEXT:    extr x10, x22, x12, #28
-; CHECK-NEXT:    bfi x9, x12, #36, #28
+; CHECK-NEXT:    bfi x10, x20, #36, #28
+; CHECK-NEXT:    strb w9, [x19, #49]
 ; CHECK-NEXT:    stur x8, [x19, #25]
-; CHECK-NEXT:    extr x8, x28, x13, #28
-; CHECK-NEXT:    mov v0.d[1], x23
-; CHECK-NEXT:    strb w11, [x19, #24]
-; CHECK-NEXT:    mov v1.d[1], x20
-; CHECK-NEXT:    stur x10, [x19, #41]
-; CHECK-NEXT:    stur x9, [x19, #33]
-; CHECK-NEXT:    bfi x25, x13, #36, #28
-; CHECK-NEXT:    str x8, [x19, #16]
-; CHECK-NEXT:    lsr x9, x23, #28
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    ldr x12, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    lsr x10, x20, #28
-; CHECK-NEXT:    strb w9, [x19, #99]
-; CHECK-NEXT:    stp x12, x25, [x19]
-; CHECK-NEXT:    extr x12, x23, x8, #28
-; CHECK-NEXT:    bfi x26, x8, #36, #28
-; CHECK-NEXT:    extr x8, x20, x11, #28
-; CHECK-NEXT:    bfi x24, x11, #36, #28
-; CHECK-NEXT:    strb w10, [x19, #74]
-; CHECK-NEXT:    stur x12, [x19, #91]
-; CHECK-NEXT:    stur x26, [x19, #83]
-; CHECK-NEXT:    stur x8, [x19, #66]
-; CHECK-NEXT:    stur x24, [x19, #58]
+; CHECK-NEXT:    stur x10, [x19, #33]
+; CHECK-NEXT:    ldp x9, x12, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    stur x9, [x19, #75]
+; CHECK-NEXT:    extr x8, x12, x24, #28
+; CHECK-NEXT:    ldr x9, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    stur x9, [x19, #50]
+; CHECK-NEXT:    ldp x11, x10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    stur x8, [x19, #91]
+; CHECK-NEXT:    lsr x8, x12, #28
+; CHECK-NEXT:    stur x27, [x19, #83]
+; CHECK-NEXT:    extr x9, x10, x11, #28
+; CHECK-NEXT:    bfi x25, x11, #36, #28
+; CHECK-NEXT:    strb w8, [x19, #99]
+; CHECK-NEXT:    stur x9, [x19, #66]
+; CHECK-NEXT:    lsr x9, x10, #28
+; CHECK-NEXT:    stur x25, [x19, #58]
+; CHECK-NEXT:    strb w9, [x19, #74]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #128] // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index de75f84110a52..1883db6c3dde6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2753,67 +2753,63 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
+; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
+; GFX6-NEXT:    v_mad_f32 v1, -v1, v2, v4
 ; GFX6-NEXT:    s_and_b32 s6, s7, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX6-NEXT:    s_and_b32 s6, s5, 0xffff
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v2, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s8, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v1, v3, v4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3029,7 +3025,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s6, s4
@@ -3045,7 +3041,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
@@ -3280,74 +3276,73 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
-; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
-; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    s_xor_b32 s4, s4, s9
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s7
-; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s6, s4
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v2, -v4, v3, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v3|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s7
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s7, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
+; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s6
 ; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -3635,7 +3630,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3719,7 +3714,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -3999,54 +3994,50 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
+; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
-; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_alignbit_b32 v0, s5, v0, 16
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
-; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v5, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s8, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -4225,7 +4216,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
@@ -4415,49 +4406,48 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
-; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
-; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    s_xor_b32 s4, s4, s9
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
-; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
-; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s6, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_mov_b32_e32 v6, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, s4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -5026,7 +5016,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    s_xor_b32 s0, s1, s0
@@ -5251,7 +5241,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s0
@@ -5274,7 +5264,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
@@ -5287,11 +5277,11 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v5
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
@@ -5404,7 +5394,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -5683,9 +5673,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
@@ -6490,7 +6480,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_xor_b32 s11, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s10
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
@@ -6504,7 +6494,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    s_add_i32 s1, s9, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
@@ -6954,7 +6944,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s9
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
@@ -7134,9 +7124,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0x11f
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -8217,7 +8207,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -8548,9 +8538,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -9297,9 +9287,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
@@ -10528,8 +10518,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2

diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fdf6342c4b400..bbc023f92d099 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1735,94 +1735,94 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
 ; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v6, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_short_d16 v4, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
 ; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v6
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v5
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v4
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v4
+; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:4
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:6
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[0:1] offset:4
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:3
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:2
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:1
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[0:1]
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v2
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
-; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v9
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
-; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:6
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
 ; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
 ; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT:    global_load_u8 v6, v0, s[2:3] offset:1
-; GFX11-NEXT:    global_load_d16_b16 v4, v0, s[2:3] offset:4
+; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v6
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v5
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v4
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b96 v7, v[4:6], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f4c2b2f060fd9..267cea2181c23 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -51,11 +51,10 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 ; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_write2_b32 v1, v0, v2 offset1:4
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 12, v1
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, 16, v1
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_barrier
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v2
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, 28, v1
 ; SI-NEXT:    ds_read_b32 v0, v0
 ; SI-NEXT:    ds_read_b32 v3, v2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -77,16 +76,13 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_barrier
-; CI-NEXT:    v_sub_i32_e32 v2, vcc, 16, v1
-; CI-NEXT:    ds_read_b32 v0, v0 offset:12
-; CI-NEXT:    ds_read_b32 v3, v2 offset:12
+; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:3 offset1:7
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(1)
-; CI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16
+; CI-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v4, v[1:2], s[0:3], 0 addr64 offset:16
 ; CI-NEXT:    s_endpgm
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 21c8260cbfefd..14b89416a721a 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -38,8 +38,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -85,8 +85,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 20ece3d1c1a56..418dfbcb5cd2e 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -56,19 +56,22 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s5
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x3
+; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x0
+; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; HAWAII-NEXT:    s_mov_b32 m0, -1
 ; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v2, s1
+; HAWAII-NEXT:    s_and_b32 s3, s0, 0xffff
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    v_mov_b32_e32 v2, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v3, s2
-; HAWAII-NEXT:    ds_write_b16 v1, v3 offset:4
+; HAWAII-NEXT:    ds_write_b16 v1, v2 offset:4
 ; HAWAII-NEXT:    s_waitcnt vmcnt(0)
-; HAWAII-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; HAWAII-NEXT:    v_or_b32_e32 v0, s3, v0
+; HAWAII-NEXT:    v_bfe_u32 v0, v0, 16, 7
 ; HAWAII-NEXT:    ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT:    ds_write_b32 v1, v2
+; HAWAII-NEXT:    ds_write_b32 v1, v3
 ; HAWAII-NEXT:    s_endpgm
 ;
 ; FIJI-LABEL: local_store_i55:

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e1dc9904b8123..f55f000e7d074 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -675,7 +675,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0xc
-; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, 0xffff
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xffff
@@ -687,7 +687,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
 ; GCN-NEXT:    s_and_b32 s8, s6, 0xffff
 ; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
+; GCN-NEXT:    v_mac_f32_e32 v1, 0, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s9, s0, 0xff000000

diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7e295cb0fb413..14d3503cce6e5 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -234,9 +234,8 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff
+; VI-NEXT:    s_add_i32 s1, s0, 12
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_i32 s1, s1, 12
 ; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; VI-NEXT:    s_or_b32 s0, s1, 4
 ; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
index 32bc6bc67d955..afd9a929f75ad 100644
--- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
+++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
@@ -1356,54 +1356,77 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-FIX-NOSCHED-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    push {r4, r5, r6, r7, r8, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB36_2
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB36_3
 ; CHECK-FIX-NOSCHED-NEXT:  @ %bb.1:
 ; CHECK-FIX-NOSCHED-NEXT:    vld1.64 {d16, d17}, [r2]
-; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d16[0]}, [r1:16]
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r4, d17
-; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vorr q9, q8, q8
+; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d16[1]
+; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d18[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r4, d18[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr r5, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s8, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r4
-; CHECK-FIX-NOSCHED-NEXT:    lsr r4, r4, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r12
-; CHECK-FIX-NOSCHED-NEXT:    lsr r12, r12, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r12, #16
 ; CHECK-FIX-NOSCHED-NEXT:    lsr lr, lr, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s7, lr
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s10, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s10
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r4, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    bne .LBB36_3
-; CHECK-FIX-NOSCHED-NEXT:    b .LBB36_4
+; CHECK-FIX-NOSCHED-NEXT:    bne .LBB36_4
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB36_2:
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r2, #10]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r2, #6]
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d0
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    b .LBB36_5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB36_3:
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r2, #10]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r2, #6]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r6, [r2, #2]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r7, [r2, #14]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r5
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r3, [r2, #12]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r2, #8]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r2, #8]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r6
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r2, #4]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r2, #4]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r8, [r2]
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s4
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
@@ -1411,44 +1434,46 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB36_4
-; CHECK-FIX-NOSCHED-NEXT:  .LBB36_3:
-; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d0[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB36_2
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB36_4:
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d0
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d1
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
-; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vorr q8, q0, q0
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d0[1]
+; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d16[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d1
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r7, d16[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s1
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r1
+; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r0
 ; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r1
-; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r7, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:  .LBB36_5:
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s7
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r1, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s9
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
-; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-FIX-NOSCHED-NEXT:    pkhbt r0, r0, r1, lsl #16
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r1, s11
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.32 d16[0], r0
@@ -1494,39 +1519,63 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX:       @ %bb.0:
 ; CHECK-CORTEX-FIX-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-CORTEX-FIX-NEXT:    push {r4, r5, r6, r7, r8, lr}
-; CHECK-CORTEX-FIX-NEXT:    .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT:    vpush {d8}
+; CHECK-CORTEX-FIX-NEXT:    .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT:    vpush {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB36_2
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB36_3
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d16[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT:    vorr q9, q8, q8
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r5, r6, d17
+; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d18[0]}, [r1:16]
 ; CHECK-CORTEX-FIX-NEXT:    lsr r7, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s4, r6
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r5
-; CHECK-CORTEX-FIX-NEXT:    vmov s14, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s7, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r12, r3, d16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s7
-; CHECK-CORTEX-FIX-NEXT:    lsr lr, r12, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r8, r3, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s5, r7
 ; CHECK-CORTEX-FIX-NEXT:    vmov s9, r8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s6
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 lr, d18[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, lr, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s9
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s11
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s13
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    bne .LBB36_3
-; CHECK-CORTEX-FIX-NEXT:    b .LBB36_4
+; CHECK-CORTEX-FIX-NEXT:    bne .LBB36_4
 ; CHECK-CORTEX-FIX-NEXT:  .LBB36_2:
+; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d1
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, r1, d0
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s2, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r1
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s0
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT:    b .LBB36_5
+; CHECK-CORTEX-FIX-NEXT:  .LBB36_3:
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r12, [r2]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh lr, [r2, #2]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r8, [r2, #4]
@@ -1535,84 +1584,86 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r2, #10]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r7, [r2, #12]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r6, [r2, #14]
+; CHECK-CORTEX-FIX-NEXT:    vmov s5, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r8
 ; CHECK-CORTEX-FIX-NEXT:    vmov s4, r6
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s5, r5
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r8
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, lr
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, r12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s6
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s9
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s11
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB36_4
-; CHECK-CORTEX-FIX-NEXT:  .LBB36_3:
-; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d0[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB36_2
 ; CHECK-CORTEX-FIX-NEXT:  .LBB36_4:
+; CHECK-CORTEX-FIX-NEXT:    vorr q8, q0, q0
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d0[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d1
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, r1, d0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s4, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
-; CHECK-CORTEX-FIX-NEXT:    lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d16[0]}, [r1:16]
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s1, r1
-; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT:    vmov r1, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s7
+; CHECK-CORTEX-FIX-NEXT:    lsr r1, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s2, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s6
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r1
 ; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
 ; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s0
-; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s9
-; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r1, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r1, s5
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r0, d16[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s9
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s3
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s3
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s18
+; CHECK-CORTEX-FIX-NEXT:  .LBB36_5:
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s4, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r1, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s8
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r1, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r1, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s1
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s11
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
 ; CHECK-CORTEX-FIX-NEXT:    vmov r1, s0
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s5
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r7, r7, r6, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s4
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r5, r5, r4, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, s2
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[0], r5
@@ -1620,7 +1671,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[1], lr
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d19[1], r12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s1
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s3
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r1, r4, r1, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d16[0], r1
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d17[0], r0
@@ -1629,7 +1680,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    aese.8 q9, q8
 ; CHECK-CORTEX-FIX-NEXT:    aesmc.8 q8, q9
 ; CHECK-CORTEX-FIX-NEXT:    vst1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT:    vpop {d8}
+; CHECK-CORTEX-FIX-NEXT:    vpop {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    pop {r4, r5, r6, r7, r8, pc}
   br i1 %0, label %5, label %12
 
@@ -1680,56 +1731,78 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-FIX-NOSCHED-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s0
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB37_2
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB37_3
 ; CHECK-FIX-NOSCHED-NEXT:  @ %bb.1:
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s9
 ; CHECK-FIX-NOSCHED-NEXT:    vld1.64 {d16, d17}, [r1]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s0
+; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d16[0], r2
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, lr, d17
-; CHECK-FIX-NOSCHED-NEXT:    vmov r2, r12, d16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr lr, lr, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s8, s2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r4, lr, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s0, lr
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r12
-; CHECK-FIX-NOSCHED-NEXT:    lsr r5, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r2, d16[0]
+; CHECK-FIX-NOSCHED-NEXT:    lsr r4, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r12, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r12, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s10, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r5
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s10
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s2
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    bne .LBB37_3
-; CHECK-FIX-NOSCHED-NEXT:    b .LBB37_4
+; CHECK-FIX-NOSCHED-NEXT:    bne .LBB37_4
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB37_2:
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    b .LBB37_5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB37_3:
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r3, [r1, #10]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r1, #6]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r1, #2]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r1, #6]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r1, #2]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r7, [r1, #14]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r3
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r2, [r1, #12]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r4
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r1, #8]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, lr
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r1, #4]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r1, #8]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r5
+; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r1, #4]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r6, [r1]
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r2
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r6
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
@@ -1737,47 +1810,48 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB37_4
-; CHECK-FIX-NOSCHED-NEXT:  .LBB37_3:
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s9
-; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d2[0], r0
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB37_2
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB37_4:
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d3
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s9, s9
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d3
+; CHECK-FIX-NOSCHED-NEXT:    vmov r7, s9
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d2[1]
+; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d2[0], r7
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r7, d2[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r2
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s5
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r0
 ; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r7, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB37_5:
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s3
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s9
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
-; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-FIX-NOSCHED-NEXT:    pkhbt r0, r0, r2, lsl #16
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s11
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.32 d16[0], r0
@@ -1822,129 +1896,153 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX:       @ %bb.0:
 ; CHECK-CORTEX-FIX-NEXT:    .save {r4, r5, r6, r7, r11, lr}
 ; CHECK-CORTEX-FIX-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-CORTEX-FIX-NEXT:    .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT:    vpush {d8}
+; CHECK-CORTEX-FIX-NEXT:    .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT:    vpush {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s0
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB37_2
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB37_3
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s9
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r2, s0
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.16 d16[0], r2
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, r5, d17
+; CHECK-CORTEX-FIX-NEXT:    lsr lr, r3, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
 ; CHECK-CORTEX-FIX-NEXT:    lsr r6, r4, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r7, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
 ; CHECK-CORTEX-FIX-NEXT:    vmov s2, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s14, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s3, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, r3, d16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r7
+; CHECK-CORTEX-FIX-NEXT:    vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r2, d16[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s11
 ; CHECK-CORTEX-FIX-NEXT:    lsr r12, r2, #16
-; CHECK-CORTEX-FIX-NEXT:    lsr lr, r3, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r2
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s13
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r2
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s15
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    bne .LBB37_3
-; CHECK-CORTEX-FIX-NEXT:    b .LBB37_4
+; CHECK-CORTEX-FIX-NEXT:    bne .LBB37_4
 ; CHECK-CORTEX-FIX-NEXT:  .LBB37_2:
-; CHECK-CORTEX-FIX-NEXT:    ldrh r12, [r1]
-; CHECK-CORTEX-FIX-NEXT:    ldrh lr, [r1, #2]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r7, [r1, #4]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r6, [r1, #6]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r5, [r1, #8]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r4, [r1, #10]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r2, [r1, #12]
-; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r1, #14]
-; CHECK-CORTEX-FIX-NEXT:    vmov s0, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s2, r2
+; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d3
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, r2, d2
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s6, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r2
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT:    b .LBB37_5
+; CHECK-CORTEX-FIX-NEXT:  .LBB37_3:
+; CHECK-CORTEX-FIX-NEXT:    ldrh r12, [r1]
+; CHECK-CORTEX-FIX-NEXT:    ldrh lr, [r1, #2]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r7, [r1, #4]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r6, [r1, #6]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r5, [r1, #8]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r4, [r1, #10]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r2, [r1, #12]
+; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r1, #14]
 ; CHECK-CORTEX-FIX-NEXT:    vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r7
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s2, r2
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r5
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s2
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s13
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB37_4
-; CHECK-CORTEX-FIX-NEXT:  .LBB37_3:
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB37_2
+; CHECK-CORTEX-FIX-NEXT:  .LBB37_4:
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r2, d2[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
 ; CHECK-CORTEX-FIX-NEXT:    vmov.16 d2[0], r0
-; CHECK-CORTEX-FIX-NEXT:  .LBB37_4:
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d3
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, r2, d2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
-; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r2
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s5, r2
-; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s3
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s2
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r3
 ; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s4
-; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r2, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, s1
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r0, d2[0]
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s9
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s7
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT:    vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s18
+; CHECK-CORTEX-FIX-NEXT:  .LBB37_5:
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s4
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r2, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r2, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r2, s3
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s1
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s11
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s5
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r7, r7, r6, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s12
 ; CHECK-CORTEX-FIX-NEXT:    vmov r2, s0
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r5, r5, r4, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, s6
@@ -1953,7 +2051,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[1], lr
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d19[1], r12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s5
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s7
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r2, r4, r2, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d16[0], r2
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d17[0], r0
@@ -1962,7 +2060,7 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX-NEXT:    aese.8 q9, q8
 ; CHECK-CORTEX-FIX-NEXT:    aesmc.8 q8, q9
 ; CHECK-CORTEX-FIX-NEXT:    vst1.64 {d16, d17}, [r1]
-; CHECK-CORTEX-FIX-NEXT:    vpop {d8}
+; CHECK-CORTEX-FIX-NEXT:    vpop {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   br i1 %0, label %5, label %11
 
@@ -3726,54 +3824,77 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-FIX-NOSCHED-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    push {r4, r5, r6, r7, r8, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB82_2
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB82_3
 ; CHECK-FIX-NOSCHED-NEXT:  @ %bb.1:
 ; CHECK-FIX-NOSCHED-NEXT:    vld1.64 {d16, d17}, [r2]
-; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d16[0]}, [r1:16]
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r4, d17
-; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vorr q9, q8, q8
+; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d16[1]
+; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d18[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r4, d18[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr r5, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s8, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r4
-; CHECK-FIX-NOSCHED-NEXT:    lsr r4, r4, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r12
-; CHECK-FIX-NOSCHED-NEXT:    lsr r12, r12, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r12, #16
 ; CHECK-FIX-NOSCHED-NEXT:    lsr lr, lr, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s7, lr
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s10, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s10
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r4, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    bne .LBB82_3
-; CHECK-FIX-NOSCHED-NEXT:    b .LBB82_4
+; CHECK-FIX-NOSCHED-NEXT:    bne .LBB82_4
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB82_2:
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r2, #10]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r2, #6]
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d0
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    b .LBB82_5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB82_3:
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r2, #10]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r2, #6]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r6, [r2, #2]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r7, [r2, #14]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r5
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r3, [r2, #12]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r2, #8]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r2, #8]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r6
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r2, #4]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r2, #4]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r8, [r2]
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s4
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
@@ -3781,44 +3902,46 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB82_4
-; CHECK-FIX-NOSCHED-NEXT:  .LBB82_3:
-; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d0[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB82_2
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB82_4:
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d0
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d1
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
-; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vorr q8, q0, q0
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d0[1]
+; CHECK-FIX-NOSCHED-NEXT:    vld1.16 {d16[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r1, d1
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r7, d16[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s1
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r1
+; CHECK-FIX-NOSCHED-NEXT:    lsr r1, r1, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r0
 ; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r1
-; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r7, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:  .LBB82_5:
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s7
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r1, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s5
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s5, s9
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
-; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r7
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-FIX-NOSCHED-NEXT:    pkhbt r0, r0, r1, lsl #16
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r1, s11
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.32 d16[0], r0
@@ -3864,39 +3987,63 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX:       @ %bb.0:
 ; CHECK-CORTEX-FIX-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-CORTEX-FIX-NEXT:    push {r4, r5, r6, r7, r8, lr}
-; CHECK-CORTEX-FIX-NEXT:    .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT:    vpush {d8}
+; CHECK-CORTEX-FIX-NEXT:    .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT:    vpush {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB82_2
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB82_3
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d16[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT:    vorr q9, q8, q8
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r5, r6, d17
+; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d18[0]}, [r1:16]
 ; CHECK-CORTEX-FIX-NEXT:    lsr r7, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s4, r6
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r5
-; CHECK-CORTEX-FIX-NEXT:    vmov s14, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s7, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r12, r3, d16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s7
-; CHECK-CORTEX-FIX-NEXT:    lsr lr, r12, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r8, r3, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s5, r7
 ; CHECK-CORTEX-FIX-NEXT:    vmov s9, r8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s6
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 lr, d18[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, lr, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s9
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s11
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s13
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    bne .LBB82_3
-; CHECK-CORTEX-FIX-NEXT:    b .LBB82_4
+; CHECK-CORTEX-FIX-NEXT:    bne .LBB82_4
 ; CHECK-CORTEX-FIX-NEXT:  .LBB82_2:
+; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d1
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, r1, d0
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s2, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r1
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s0
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT:    b .LBB82_5
+; CHECK-CORTEX-FIX-NEXT:  .LBB82_3:
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r12, [r2]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh lr, [r2, #2]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r8, [r2, #4]
@@ -3905,84 +4052,86 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r2, #10]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r7, [r2, #12]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r6, [r2, #14]
+; CHECK-CORTEX-FIX-NEXT:    vmov s5, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r8
 ; CHECK-CORTEX-FIX-NEXT:    vmov s4, r6
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s5, r5
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r8
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, lr
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, r12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s6
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s9
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s11
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB82_4
-; CHECK-CORTEX-FIX-NEXT:  .LBB82_3:
-; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d0[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB82_2
 ; CHECK-CORTEX-FIX-NEXT:  .LBB82_4:
+; CHECK-CORTEX-FIX-NEXT:    vorr q8, q0, q0
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d0[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d1
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, r1, d0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s4, s4
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
-; CHECK-CORTEX-FIX-NEXT:    lsr r7, r1, #16
+; CHECK-CORTEX-FIX-NEXT:    vld1.16 {d16[0]}, [r1:16]
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s1, r1
-; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT:    vmov r1, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s7
+; CHECK-CORTEX-FIX-NEXT:    lsr r1, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s2, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s6
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r1
 ; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
 ; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s0
-; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s9
-; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r1, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r1, s5
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r0, d16[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s9
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s3
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s3
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s0, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s18
+; CHECK-CORTEX-FIX-NEXT:  .LBB82_5:
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s4, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r1, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s8
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r1, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r1, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s1
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s11
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
 ; CHECK-CORTEX-FIX-NEXT:    vmov r1, s0
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s5
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r7, r7, r6, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s4
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r5, r5, r4, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, s2
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[0], r5
@@ -3990,7 +4139,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[1], lr
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d19[1], r12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s1
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s3
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r1, r4, r1, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d16[0], r1
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d17[0], r0
@@ -3999,7 +4148,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, <
 ; CHECK-CORTEX-FIX-NEXT:    aesd.8 q9, q8
 ; CHECK-CORTEX-FIX-NEXT:    aesimc.8 q8, q9
 ; CHECK-CORTEX-FIX-NEXT:    vst1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT:    vpop {d8}
+; CHECK-CORTEX-FIX-NEXT:    vpop {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    pop {r4, r5, r6, r7, r8, pc}
   br i1 %0, label %5, label %12
 
@@ -4050,56 +4199,78 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-FIX-NOSCHED-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s0
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB83_2
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB83_3
 ; CHECK-FIX-NOSCHED-NEXT:  @ %bb.1:
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s9
 ; CHECK-FIX-NOSCHED-NEXT:    vld1.64 {d16, d17}, [r1]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s0
+; CHECK-FIX-NOSCHED-NEXT:    vmov lr, r12, d17
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d16[0], r2
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, lr, d17
-; CHECK-FIX-NOSCHED-NEXT:    vmov r2, r12, d16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, lr
+; CHECK-FIX-NOSCHED-NEXT:    lsr lr, lr, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s8, s2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r4, lr, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s0, lr
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r12
-; CHECK-FIX-NOSCHED-NEXT:    lsr r5, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r2, d16[0]
+; CHECK-FIX-NOSCHED-NEXT:    lsr r4, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r12, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r12, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s10, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r5
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-FIX-NOSCHED-NEXT:    vmov s14, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s10
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s3, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s2
+; CHECK-FIX-NOSCHED-NEXT:    vmov s2, r3
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s2
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    bne .LBB83_3
-; CHECK-FIX-NOSCHED-NEXT:    b .LBB83_4
+; CHECK-FIX-NOSCHED-NEXT:    bne .LBB83_4
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB83_2:
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    b .LBB83_5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB83_3:
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r3, [r1, #10]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r1, #6]
-; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r1, #2]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r1, #6]
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r1, #2]
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r7, [r1, #14]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r3
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r2, [r1, #12]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r4
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r5, [r1, #8]
-; CHECK-FIX-NOSCHED-NEXT:    vmov s1, lr
-; CHECK-FIX-NOSCHED-NEXT:    ldrh r12, [r1, #4]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    ldrh r4, [r1, #8]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r5
+; CHECK-FIX-NOSCHED-NEXT:    ldrh lr, [r1, #4]
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r7
 ; CHECK-FIX-NOSCHED-NEXT:    ldrh r6, [r1]
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s2, s0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s0, r2
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s8, r4
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT:    vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT:    vmov s12, lr
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s3, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vmov s1, r6
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s0, s0
@@ -4107,47 +4278,48 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-FIX-NOSCHED-NEXT:    cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT:    beq .LBB83_4
-; CHECK-FIX-NOSCHED-NEXT:  .LBB83_3:
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s9
-; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d2[0], r0
+; CHECK-FIX-NOSCHED-NEXT:    beq .LBB83_2
 ; CHECK-FIX-NOSCHED-NEXT:  .LBB83_4:
-; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d2
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, r7, d3
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
-; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s9, s9
+; CHECK-FIX-NOSCHED-NEXT:    vmov r0, r2, d3
+; CHECK-FIX-NOSCHED-NEXT:    vmov r7, s9
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r3, d2[1]
+; CHECK-FIX-NOSCHED-NEXT:    vmov.16 d2[0], r7
+; CHECK-FIX-NOSCHED-NEXT:    vmov.32 r7, d2[0]
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r3
+; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r2
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s9, s5
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r0
 ; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r2
-; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
+; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT:    lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r0
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT:    vmov s13, r3
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
+; CHECK-FIX-NOSCHED-NEXT:    lsr r0, r7, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:  .LBB83_5:
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s1
+; CHECK-FIX-NOSCHED-NEXT:    vmov s15, r0
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r0, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s3
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s6, r3
-; CHECK-FIX-NOSCHED-NEXT:    lsr r3, r3, #16
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s15
-; CHECK-FIX-NOSCHED-NEXT:    vmov s7, r3
-; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s1
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s11, s11
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-FIX-NOSCHED-NEXT:    vmov r3, s1
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s1, s9
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s6, s6
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s7, s7
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-FIX-NOSCHED-NEXT:    vmov s4, r7
-; CHECK-FIX-NOSCHED-NEXT:    lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vmov s5, r7
 ; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s4, s4
-; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-FIX-NOSCHED-NEXT:    pkhbt r0, r0, r2, lsl #16
 ; CHECK-FIX-NOSCHED-NEXT:    vmov r2, s11
 ; CHECK-FIX-NOSCHED-NEXT:    vmov.32 d16[0], r0
@@ -4192,42 +4364,65 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX:       @ %bb.0:
 ; CHECK-CORTEX-FIX-NEXT:    .save {r4, r5, r6, r7, r11, lr}
 ; CHECK-CORTEX-FIX-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-CORTEX-FIX-NEXT:    .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT:    vpush {d8}
+; CHECK-CORTEX-FIX-NEXT:    .vsave {d8, d9}
+; CHECK-CORTEX-FIX-NEXT:    vpush {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s0
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB83_2
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB83_3
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s9
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r2, s0
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.16 d16[0], r2
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, r5, d17
+; CHECK-CORTEX-FIX-NEXT:    lsr lr, r3, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
 ; CHECK-CORTEX-FIX-NEXT:    lsr r6, r4, #16
 ; CHECK-CORTEX-FIX-NEXT:    lsr r7, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s0, r5
 ; CHECK-CORTEX-FIX-NEXT:    vmov s2, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s14, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s3, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, r3, d16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r7
+; CHECK-CORTEX-FIX-NEXT:    vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r2, d16[0]
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s11
 ; CHECK-CORTEX-FIX-NEXT:    lsr r12, r2, #16
-; CHECK-CORTEX-FIX-NEXT:    lsr lr, r3, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r2
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s13
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r2
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s15
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    bne .LBB83_3
-; CHECK-CORTEX-FIX-NEXT:    b .LBB83_4
+; CHECK-CORTEX-FIX-NEXT:    bne .LBB83_4
 ; CHECK-CORTEX-FIX-NEXT:  .LBB83_2:
+; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d3
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, r2, d2
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s6, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r2
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s6
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT:    b .LBB83_5
+; CHECK-CORTEX-FIX-NEXT:  .LBB83_3:
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r12, [r1]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh lr, [r1, #2]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r7, [r1, #4]
@@ -4236,85 +4431,86 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r4, [r1, #10]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r2, [r1, #12]
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r1, #14]
+; CHECK-CORTEX-FIX-NEXT:    vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT:    vmov s3, r7
 ; CHECK-CORTEX-FIX-NEXT:    vmov s0, r3
 ; CHECK-CORTEX-FIX-NEXT:    vmov s2, r2
-; CHECK-CORTEX-FIX-NEXT:    vmov s1, r6
 ; CHECK-CORTEX-FIX-NEXT:    vmov s8, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s10, r5
-; CHECK-CORTEX-FIX-NEXT:    vmov s11, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, lr
-; CHECK-CORTEX-FIX-NEXT:    vmov s15, r12
+; CHECK-CORTEX-FIX-NEXT:    vmov s12, r5
+; CHECK-CORTEX-FIX-NEXT:    vmov s11, lr
+; CHECK-CORTEX-FIX-NEXT:    vmov s13, r12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s14, s0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s2
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s2, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s11
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s15
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s0, s12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s8, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s12, s13
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT:    beq .LBB83_4
-; CHECK-CORTEX-FIX-NEXT:  .LBB83_3:
+; CHECK-CORTEX-FIX-NEXT:    beq .LBB83_2
+; CHECK-CORTEX-FIX-NEXT:  .LBB83_4:
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r2, d2[1]
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
 ; CHECK-CORTEX-FIX-NEXT:    vmov.16 d2[0], r0
-; CHECK-CORTEX-FIX-NEXT:  .LBB83_4:
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, r5, d3
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, r2, d2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
-; CHECK-CORTEX-FIX-NEXT:    lsr r7, r2, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s7, r2
 ; CHECK-CORTEX-FIX-NEXT:    lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s5, r2
-; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT:    vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT:    vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, s14
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s3
+; CHECK-CORTEX-FIX-NEXT:    lsr r3, r5, #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov s6, r6
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s2
-; CHECK-CORTEX-FIX-NEXT:    vmov s13, r3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov s9, r3
 ; CHECK-CORTEX-FIX-NEXT:    vmov s15, r4
-; CHECK-CORTEX-FIX-NEXT:    vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s4
-; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s11, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r2, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r2, s1
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT:    vmov.32 r0, d2[0]
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r5
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s13, s9
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s9, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s7
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT:    vmov s18, r0
+; CHECK-CORTEX-FIX-NEXT:    lsr r12, r0, #16
+; CHECK-CORTEX-FIX-NEXT:    vmov s4, r12
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s7, s18
+; CHECK-CORTEX-FIX-NEXT:  .LBB83_5:
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s2, s2
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s12, s12
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f32.f16 s4, s4
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r2, s14
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s10, s1
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s14, s5
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT:    pkhbt r12, r0, r2, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r2, s3
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s1
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s11
 ; CHECK-CORTEX-FIX-NEXT:    vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT:    pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT:    vmov r3, s14
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT:    vmov r7, s5
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r7, r7, r6, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r6, s0
 ; CHECK-CORTEX-FIX-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT:    vmov r5, s12
 ; CHECK-CORTEX-FIX-NEXT:    vmov r2, s0
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r5, r5, r4, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov r4, s6
@@ -4323,7 +4519,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d18[1], lr
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d19[1], r12
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT:    vmov r4, s5
+; CHECK-CORTEX-FIX-NEXT:    vmov r4, s7
 ; CHECK-CORTEX-FIX-NEXT:    pkhbt r2, r4, r2, lsl #16
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d16[0], r2
 ; CHECK-CORTEX-FIX-NEXT:    vmov.32 d17[0], r0
@@ -4332,7 +4528,7 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1
 ; CHECK-CORTEX-FIX-NEXT:    aesd.8 q9, q8
 ; CHECK-CORTEX-FIX-NEXT:    aesimc.8 q8, q9
 ; CHECK-CORTEX-FIX-NEXT:    vst1.64 {d16, d17}, [r1]
-; CHECK-CORTEX-FIX-NEXT:    vpop {d8}
+; CHECK-CORTEX-FIX-NEXT:    vpop {d8, d9}
 ; CHECK-CORTEX-FIX-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   br i1 %0, label %5, label %11
 

diff  --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index 4293b9261f975..e3f824c214ade 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -1110,29 +1110,28 @@ define i64 @gorc2b_i64(i64 %a) nounwind {
 ;
 ; RV32ZBP-LABEL: gorc2b_i64:
 ; RV32ZBP:       # %bb.0:
-; RV32ZBP-NEXT:    srli a2, a1, 2
-; RV32ZBP-NEXT:    srli a3, a0, 2
-; RV32ZBP-NEXT:    lui a4, 209715
-; RV32ZBP-NEXT:    addi a4, a4, 819
-; RV32ZBP-NEXT:    and a3, a3, a4
-; RV32ZBP-NEXT:    or a3, a3, a0
-; RV32ZBP-NEXT:    or a2, a2, a1
-; RV32ZBP-NEXT:    orc2.n a1, a1
+; RV32ZBP-NEXT:    srli a2, a0, 2
+; RV32ZBP-NEXT:    srli a3, a1, 2
+; RV32ZBP-NEXT:    or a3, a3, a1
+; RV32ZBP-NEXT:    or a2, a2, a0
 ; RV32ZBP-NEXT:    orc2.n a0, a0
+; RV32ZBP-NEXT:    orc2.n a1, a1
 ; RV32ZBP-NEXT:    slli a2, a2, 2
 ; RV32ZBP-NEXT:    slli a3, a3, 2
-; RV32ZBP-NEXT:    lui a5, 838861
-; RV32ZBP-NEXT:    addi a5, a5, -820
-; RV32ZBP-NEXT:    and a3, a3, a5
-; RV32ZBP-NEXT:    and a2, a2, a5
+; RV32ZBP-NEXT:    lui a4, 838861
+; RV32ZBP-NEXT:    addi a4, a4, -820
+; RV32ZBP-NEXT:    and a3, a3, a4
+; RV32ZBP-NEXT:    and a2, a2, a4
+; RV32ZBP-NEXT:    srli a4, a1, 2
 ; RV32ZBP-NEXT:    srli a5, a0, 2
-; RV32ZBP-NEXT:    srli a6, a1, 2
-; RV32ZBP-NEXT:    and a6, a6, a4
-; RV32ZBP-NEXT:    and a4, a5, a4
-; RV32ZBP-NEXT:    or a0, a4, a0
-; RV32ZBP-NEXT:    or a1, a6, a1
-; RV32ZBP-NEXT:    or a1, a1, a2
-; RV32ZBP-NEXT:    or a0, a0, a3
+; RV32ZBP-NEXT:    lui a6, 209715
+; RV32ZBP-NEXT:    addi a6, a6, 819
+; RV32ZBP-NEXT:    and a5, a5, a6
+; RV32ZBP-NEXT:    and a4, a4, a6
+; RV32ZBP-NEXT:    or a1, a4, a1
+; RV32ZBP-NEXT:    or a0, a5, a0
+; RV32ZBP-NEXT:    or a0, a0, a2
+; RV32ZBP-NEXT:    or a1, a1, a3
 ; RV32ZBP-NEXT:    ret
   %and1 = shl i64 %a, 2
   %shl1 = and i64 %and1, -3689348814741910324

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index 120105cfd14c7..0eb057a3c5bd4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -6,55 +6,55 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d9}
-; CHECK-NEXT:    vpush {d9}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q3, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov r4, r1, d6
-; CHECK-NEXT:    vmov r0, r12, d7
-; CHECK-NEXT:    vldrw.u32 q3, [r2]
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov.f32 s0, s12
-; CHECK-NEXT:    vmov.f32 s6, s13
-; CHECK-NEXT:    adds r2, r5, r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    asr.w r6, r5, #31
-; CHECK-NEXT:    adcs r1, r6
-; CHECK-NEXT:    asrl r2, r1, r4
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    adds r6, r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    asr.w r4, r1, #31
-; CHECK-NEXT:    adc.w r1, r4, lr
-; CHECK-NEXT:    asrl r6, r1, r3
-; CHECK-NEXT:    vmov r5, r4, d1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
-; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r3, r1, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vand q3, q1, q2
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vmov lr, r12, d7
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vand q2, q4, q2
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    adcs r5, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    asrl r0, r5, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adcs r1, r4
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    asrl r2, r1, r3
+; CHECK-NEXT:    vmov r4, r5, d6
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    adds.w r6, r1, lr
 ; CHECK-NEXT:    asr.w r3, r1, #31
 ; CHECK-NEXT:    adc.w r1, r3, r12
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    asrl r0, r1, r3
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    adds r6, r1, r5
-; CHECK-NEXT:    asr.w r2, r1, #31
-; CHECK-NEXT:    adc.w r1, r2, r4
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    asrl r6, r1, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
-; CHECK-NEXT:    vpop {d9}
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    asrl r6, r1, r3
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    adds r4, r4, r1
+; CHECK-NEXT:    asr.w r3, r1, #31
+; CHECK-NEXT:    adc.w r1, r3, r5
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    asrl r4, r1, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4
@@ -142,56 +142,56 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d9}
-; CHECK-NEXT:    vpush {d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q3, q2, q0
-; CHECK-NEXT:    vand q1, q1, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r4, lr, d2
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r5, r1, d6
-; CHECK-NEXT:    vmov r0, r12, d7
-; CHECK-NEXT:    vldrw.u32 q3, [r2]
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmov.f32 s14, s1
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov.f32 s4, s12
-; CHECK-NEXT:    vmov.f32 s2, s13
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vand q2, q0, q4
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov r5, r1, d3
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vmov r4, lr, d5
+; CHECK-NEXT:    vmov.f32 s20, s6
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s22, s7
+; CHECK-NEXT:    vand q4, q5, q4
+; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    adds r2, r6, r5
-; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    asr.w r7, r6, #31
 ; CHECK-NEXT:    adcs r1, r7
 ; CHECK-NEXT:    asrl r2, r1, r5
-; CHECK-NEXT:    vmov r7, s4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r7, s2
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    adds r4, r4, r1
 ; CHECK-NEXT:    asr.w r5, r1, #31
 ; CHECK-NEXT:    adc.w r1, r5, lr
 ; CHECK-NEXT:    asrl r4, r1, r7
-; CHECK-NEXT:    vmov r6, r5, d3
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r2
+; CHECK-NEXT:    vmov r6, r5, d4
+; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    asr.w r7, r1, #31
 ; CHECK-NEXT:    adc.w r1, r7, r12
-; CHECK-NEXT:    vmov r7, s18
+; CHECK-NEXT:    vmov r7, s16
 ; CHECK-NEXT:    asrl r0, r1, r7
-; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    adds r6, r6, r1
-; CHECK-NEXT:    asr.w r2, r1, #31
-; CHECK-NEXT:    adc.w r1, r2, r5
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    asrl r6, r1, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r6, r0
-; CHECK-NEXT:    vstrw.32 q1, [r3]
-; CHECK-NEXT:    vpop {d9}
+; CHECK-NEXT:    asr.w r7, r1, #31
+; CHECK-NEXT:    adc.w r1, r7, r5
+; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    asrl r6, r1, r7
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r2
+; CHECK-NEXT:    vstrw.32 q0, [r3]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
@@ -276,8 +276,8 @@ entry:
 define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
 ; CHECK-LABEL: load_one_store_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s2, s3
@@ -285,27 +285,27 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
 ; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    adds.w r12, r2, r2
 ; CHECK-NEXT:    asr.w r3, r2, #31
-; CHECK-NEXT:    adc.w r7, r3, r2, asr #31
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    asrl r12, r7, r2
-; CHECK-NEXT:    adds r0, r3, r3
-; CHECK-NEXT:    asr.w r5, r3, #31
-; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
-; CHECK-NEXT:    asrl r0, r5, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    adds r4, r3, r3
-; CHECK-NEXT:    asr.w r5, r3, #31
-; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
-; CHECK-NEXT:    asrl r4, r5, r3
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    adc.w r3, r3, r2, asr #31
+; CHECK-NEXT:    asrl r12, r3, r2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r2, r3, r3
+; CHECK-NEXT:    asr.w r0, r3, #31
+; CHECK-NEXT:    adc.w r5, r0, r3, asr #31
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    asrl r2, r5, r3
 ; CHECK-NEXT:    adds r4, r0, r0
-; CHECK-NEXT:    asr.w r2, r0, #31
-; CHECK-NEXT:    adc.w r3, r2, r0, asr #31
+; CHECK-NEXT:    asr.w r3, r0, #31
+; CHECK-NEXT:    adc.w r3, r3, r0, asr #31
 ; CHECK-NEXT:    asrl r4, r3, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r12
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    adds r6, r0, r0
+; CHECK-NEXT:    asr.w r3, r0, #31
+; CHECK-NEXT:    adc.w r3, r3, r0, asr #31
+; CHECK-NEXT:    asrl r6, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4
   %sa = sext <4 x i32> %a to <4 x i64>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index f412204993ea2..879351b07b425 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -180,44 +180,44 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vmov.f32 s12, s6
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s6, s5
 ; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r3, r7, d2
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov r0, r1, d6
 ; CHECK-NEXT:    vmov.f32 s2, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r12, lr, d7
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    asr.w r5, r4, #31
+; CHECK-NEXT:    vmov lr, r12, d7
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    asrs r5, r2, #31
+; CHECK-NEXT:    adds r2, r2, r0
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adcs r1, r5
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    lsrl r2, r1, #1
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    adds.w r0, r0, lr
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    asrs r4, r5, #31
+; CHECK-NEXT:    adds r6, r5, r3
+; CHECK-NEXT:    vmov r3, r5, d3
+; CHECK-NEXT:    vmov.f32 s6, s1
 ; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    asr.w r4, r1, #31
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    lsrl r2, r3, #1
-; CHECK-NEXT:    vmov r1, r5, d3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds.w r4, r3, r12
-; CHECK-NEXT:    asr.w r6, r3, #31
-; CHECK-NEXT:    adc.w r3, r6, lr
-; CHECK-NEXT:    asrs r2, r0, #31
-; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adcs r7, r4
+; CHECK-NEXT:    lsrl r6, r7, #1
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adds r6, r1, r3
+; CHECK-NEXT:    asr.w r2, r1, #31
 ; CHECK-NEXT:    adc.w r1, r2, r5
-; CHECK-NEXT:    lsrl r4, r3, #1
-; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    lsrl r6, r1, #1
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %sa = sext <4 x i32> %a to <4 x i64>
   %sb = zext <4 x i32> %b to <4 x i64>
@@ -328,107 +328,98 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: ext_ops_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s2
-; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov.f32 s2, s3
-; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vmov r1, r7, d4
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vmov r2, r12, d5
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov r10, s8
+; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov r6, s2
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    adds r0, r3, r1
-; CHECK-NEXT:    asr.w r5, r3, #31
-; CHECK-NEXT:    adcs r5, r7
-; CHECK-NEXT:    asrl r0, r5, r1
-; CHECK-NEXT:    subs.w lr, r0, r1
-; CHECK-NEXT:    asr.w r0, r6, #31
-; CHECK-NEXT:    sbc.w r8, r5, r7
-; CHECK-NEXT:    adds r4, r6, r2
-; CHECK-NEXT:    adc.w r5, r0, r12
-; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    asrl r4, r5, r2
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    asr.w r0, r10, #31
+; CHECK-NEXT:    asrs r7, r6, #31
+; CHECK-NEXT:    adds.w r4, r10, r2
+; CHECK-NEXT:    adc r3, r0, #0
+; CHECK-NEXT:    asrl r4, r3, r2
 ; CHECK-NEXT:    subs r0, r4, r2
-; CHECK-NEXT:    sbc.w r5, r5, r12
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    umull r0, r4, r0, r2
-; CHECK-NEXT:    mla r5, r5, r2, r4
-; CHECK-NEXT:    eor.w r4, r3, r1
-; CHECK-NEXT:    orr.w r4, r4, r3, asr #31
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, eq
-; CHECK-NEXT:    bfi r7, r4, #0, #8
-; CHECK-NEXT:    eor.w r4, r6, r2
-; CHECK-NEXT:    orr.w r4, r4, r6, asr #31
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    lsll r0, r5, r6
-; CHECK-NEXT:    csetm r4, eq
-; CHECK-NEXT:    lsll r0, r5, r2
-; CHECK-NEXT:    bfi r7, r4, #8, #8
-; CHECK-NEXT:    rsbs r2, r3, #0
+; CHECK-NEXT:    sbc lr, r3, #0
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    umull r0, r8, r0, r2
+; CHECK-NEXT:    adds r4, r6, r3
+; CHECK-NEXT:    eor.w r1, r6, r3
+; CHECK-NEXT:    adc r5, r7, #0
+; CHECK-NEXT:    eor.w r7, r10, r2
+; CHECK-NEXT:    asrl r4, r5, r3
+; CHECK-NEXT:    orr.w r7, r7, r10, asr #31
+; CHECK-NEXT:    subs r4, r4, r3
+; CHECK-NEXT:    orr.w r1, r1, r6, asr #31
+; CHECK-NEXT:    sbc r5, r5, #0
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    umull r4, r12, r4, r3
+; CHECK-NEXT:    csetm r9, eq
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    bfi r7, r9, #0, #8
+; CHECK-NEXT:    csetm r1, eq
+; CHECK-NEXT:    bfi r7, r1, #8, #8
+; CHECK-NEXT:    mla r5, r5, r3, r12
+; CHECK-NEXT:    rsbs r1, r6, #0
 ; CHECK-NEXT:    vmsr p0, r7
-; CHECK-NEXT:    umull r4, r7, lr, r1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    mla r7, r8, r1, r7
-; CHECK-NEXT:    lsll r4, r7, r2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    lsll r4, r7, r1
-; CHECK-NEXT:    vmov r1, r7, d2
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r0
-; CHECK-NEXT:    vpsel q2, q4, q2
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    adds r4, r3, r1
-; CHECK-NEXT:    adc.w r5, r0, r7
-; CHECK-NEXT:    asrl r4, r5, r1
-; CHECK-NEXT:    subs r0, r4, r1
-; CHECK-NEXT:    sbc.w r7, r5, r7
-; CHECK-NEXT:    umull r0, r4, r0, r1
-; CHECK-NEXT:    mla r9, r7, r1, r4
-; CHECK-NEXT:    vmov r7, s2
-; CHECK-NEXT:    adds r6, r7, r2
-; CHECK-NEXT:    asr.w r4, r7, #31
-; CHECK-NEXT:    adc.w r5, r4, lr
-; CHECK-NEXT:    asrl r6, r5, r2
-; CHECK-NEXT:    subs r4, r6, r2
-; CHECK-NEXT:    sbc.w r6, r5, lr
-; CHECK-NEXT:    eor.w r5, r3, r1
-; CHECK-NEXT:    orr.w r5, r5, r3, asr #31
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    lsll r0, r9, r3
-; CHECK-NEXT:    csetm r5, eq
-; CHECK-NEXT:    rsbs r3, r7, #0
-; CHECK-NEXT:    bfi r12, r5, #0, #8
-; CHECK-NEXT:    eor.w r5, r7, r2
-; CHECK-NEXT:    orr.w r5, r5, r7, asr #31
-; CHECK-NEXT:    lsll r0, r9, r1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, eq
-; CHECK-NEXT:    bfi r12, r5, #8, #8
-; CHECK-NEXT:    umull r4, r5, r4, r2
-; CHECK-NEXT:    vmsr p0, r12
-; CHECK-NEXT:    mla r5, r6, r2, r5
+; CHECK-NEXT:    mla r7, lr, r2, r8
+; CHECK-NEXT:    lsll r4, r5, r1
+; CHECK-NEXT:    rsb.w r1, r10, #0
+; CHECK-NEXT:    lsll r0, r7, r1
+; CHECK-NEXT:    vmov lr, s2
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    lsll r0, r7, r2
 ; CHECK-NEXT:    lsll r4, r5, r3
-; CHECK-NEXT:    lsll r4, r5, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r4
+; CHECK-NEXT:    vpsel q2, q3, q2
+; CHECK-NEXT:    adds.w r2, lr, r1
+; CHECK-NEXT:    asr.w r0, lr, #31
+; CHECK-NEXT:    adc r3, r0, #0
+; CHECK-NEXT:    asrl r2, r3, r1
+; CHECK-NEXT:    subs r0, r2, r1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sbc r7, r3, #0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r0, r6, r0, r1
+; CHECK-NEXT:    asrs r5, r2, #31
+; CHECK-NEXT:    adds r4, r2, r3
+; CHECK-NEXT:    adc r5, r5, #0
+; CHECK-NEXT:    asrl r4, r5, r3
+; CHECK-NEXT:    subs r4, r4, r3
+; CHECK-NEXT:    sbc r8, r5, #0
+; CHECK-NEXT:    mla r5, r7, r1, r6
+; CHECK-NEXT:    eor.w r6, lr, r1
+; CHECK-NEXT:    orr.w r6, r6, lr, asr #31
+; CHECK-NEXT:    eor.w r7, r2, r3
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    orr.w r7, r7, r2, asr #31
+; CHECK-NEXT:    csetm r6, eq
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    csetm r7, eq
+; CHECK-NEXT:    rsb.w lr, lr, #0
+; CHECK-NEXT:    bfi r12, r7, #0, #8
+; CHECK-NEXT:    lsll r0, r5, lr
+; CHECK-NEXT:    bfi r12, r6, #8, #8
+; CHECK-NEXT:    umull r4, r6, r4, r3
+; CHECK-NEXT:    lsll r0, r5, r1
+; CHECK-NEXT:    rsbs r1, r2, #0
+; CHECK-NEXT:    vmsr p0, r12
+; CHECK-NEXT:    mla r7, r8, r3, r6
+; CHECK-NEXT:    lsll r4, r7, r1
+; CHECK-NEXT:    lsll r4, r7, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s10
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %sa = sext <4 x i32> %a to <4 x i64>
   %sb = zext <4 x i32> %b to <4 x i64>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 7a62d6d148167..9c283fb6298ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -57,19 +57,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpt.s32 lt, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q5, [r0]
-; CHECK-NEXT:    vmov.f32 s2, s21
+; CHECK-NEXT:    vmov.f32 s2, s23
+; CHECK-NEXT:    vmov.f32 s16, s22
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    asrs r3, r2, #31
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov.f32 s2, s23
+; CHECK-NEXT:    vmov.f32 s2, s21
 ; CHECK-NEXT:    vmov d8, r0, r1
-; CHECK-NEXT:    vmov.f32 s20, s22
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    asrs r3, r2, #31
 ; CHECK-NEXT:    mov r0, r2
@@ -82,8 +82,8 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov d10, r0, r1
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmov q0, q5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index ac4c6566ee414..9819a8253f345 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -401,26 +401,26 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    subs r4, r4, r6
 ; CHECK-NEXT:    sbc.w r9, r3, r6, asr #31
 ; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    subs r5, r7, r6
+; CHECK-NEXT:    asr.w r7, r7, #31
 ; CHECK-NEXT:    vmov q2[2], q2[0], r5, r8
-; CHECK-NEXT:    asr.w r5, r7, #31
-; CHECK-NEXT:    sbc.w r5, r5, r6, asr #31
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r7, s6
-; CHECK-NEXT:    subs r3, r7, r6
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
-; CHECK-NEXT:    asr.w r3, r5, #31
-; CHECK-NEXT:    mov.w r4, #0
-; CHECK-NEXT:    bfi r4, r3, #0, #4
-; CHECK-NEXT:    asr.w r3, r9, #31
-; CHECK-NEXT:    bfi r4, r3, #4, #4
-; CHECK-NEXT:    asr.w r3, r12, #31
-; CHECK-NEXT:    bfi r4, r3, #8, #4
-; CHECK-NEXT:    asr.w r3, r7, #31
-; CHECK-NEXT:    sbc.w r3, r3, r6, asr #31
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    sbc.w r6, r7, r6, asr #31
+; CHECK-NEXT:    asrs r6, r6, #31
+; CHECK-NEXT:    subs r7, r3, r5
+; CHECK-NEXT:    asr.w r3, r3, #31
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r7
+; CHECK-NEXT:    mov.w r7, #0
+; CHECK-NEXT:    sbc.w r3, r3, r5, asr #31
+; CHECK-NEXT:    bfi r7, r6, #0, #4
+; CHECK-NEXT:    asr.w r4, r9, #31
+; CHECK-NEXT:    asr.w r6, r12, #31
+; CHECK-NEXT:    bfi r7, r4, #4, #4
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    bfi r4, r3, #12, #4
-; CHECK-NEXT:    vmsr p0, r4
+; CHECK-NEXT:    bfi r7, r6, #8, #4
+; CHECK-NEXT:    bfi r7, r3, #12, #4
+; CHECK-NEXT:    vmsr p0, r7
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vsubt.i32 q2, q0, q2
 ; CHECK-NEXT:    vstrb.8 q2, [r2], #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index f66eb8584a0bd..217caeebe6335 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -232,34 +232,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.f32 s4, s5
-; CHECK-NEXT:    vmov.f32 s6, s7
-; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    umull r2, r5, r3, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
 ; CHECK-NEXT:    asrs r2, r0, #31
 ; CHECK-NEXT:    mla r4, r1, r2, r12
 ; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    mla r5, r3, r2, r5
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    mla r1, r1, r0, r4
-; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    mla r3, r3, r0, r5
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    umull r5, lr, r4, r0
-; CHECK-NEXT:    umull r3, r12, r1, r0
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
-; CHECK-NEXT:    mla r3, r1, r2, r12
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r3, r5, r1, r0
+; CHECK-NEXT:    mla r5, r1, r2, r5
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    mla r2, r4, r2, lr
-; CHECK-NEXT:    mla r1, r1, r0, r3
-; CHECK-NEXT:    asrs r3, r4, #31
-; CHECK-NEXT:    mla r0, r3, r0, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    mla r12, r1, r0, r5
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    umull r4, r1, r5, r0
+; CHECK-NEXT:    mla r1, r5, r2, r1
+; CHECK-NEXT:    asrs r2, r5, #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    mla r0, r2, r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -276,34 +275,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    asrs r4, r0, #31
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.f32 s4, s5
-; CHECK-NEXT:    vmov.f32 s6, s7
-; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    umull r2, r5, r0, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
 ; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    mla r2, r0, r2, r12
 ; CHECK-NEXT:    mla r1, r4, r1, r2
 ; CHECK-NEXT:    asrs r2, r3, #31
 ; CHECK-NEXT:    mla r2, r0, r2, r5
-; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    mla r2, r4, r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    umull r3, lr, r0, r5
-; CHECK-NEXT:    umull r2, r12, r0, r1
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    mla r2, r0, r2, r12
-; CHECK-NEXT:    mla r1, r4, r1, r2
-; CHECK-NEXT:    asrs r2, r5, #31
-; CHECK-NEXT:    mla r0, r0, r2, lr
-; CHECK-NEXT:    mla r0, r4, r5, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r2, r3, r0, r1
+; CHECK-NEXT:    asrs r5, r1, #31
+; CHECK-NEXT:    mla r3, r0, r5, r3
+; CHECK-NEXT:    mla r12, r4, r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r5, r1, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r1
+; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>

diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 4b28c2b07cacc..8bc247d9ebaf3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,21 +8,18 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldrd r12, r3, [r0]
+; CHECK-NEXT:    ldrd lr, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.f32 s8, s7
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r12, lr
+; CHECK-NEXT:    strd r2, r0, [r1, #16]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    strd r2, r0, [r1, #16]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0

diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 987fa7732e424..2f6576f29d0ac 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -349,19 +349,17 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
 ; X64-LABEL: test_bitreverse_shli_bitreverse_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    bswapq %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    andq %rcx, %rdi
-; X64-NEXT:    shlq $4, %rdi
-; X64-NEXT:    orq %rax, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    leaq (%rdi,%rcx,4), %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $357913941, %ecx # imm = 0x15555555
 ; X64-NEXT:    shrl %eax

diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index c7844698f8693..ec73b64b993b9 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -50,55 +50,59 @@ define i96 @square_high(i96 %x) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    movzbl %al, %ecx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    addb $255, %cl
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %eax
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    setb %ah
+; X86-NEXT:    addb $255, %al
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movzbl %ah, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %eax
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index ddd6f002a0992..f9b3a19889e5f 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -556,18 +556,16 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl $1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
@@ -583,18 +581,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ;
 ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
 ; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movl $1, %eax
-; X64-SSE2-NEXT:    movd %eax, %xmm2
 ; X64-SSE2-NEXT:    pslld $23, %xmm1
 ; X64-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq
@@ -654,18 +650,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl $1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
@@ -681,18 +675,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ;
 ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
 ; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movl $1, %eax
-; X64-SSE2-NEXT:    movd %eax, %xmm2
 ; X64-SSE2-NEXT:    pslld $23, %xmm1
 ; X64-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 6dfc15ed38e75..cb5d3b0ac21c0 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -117,7 +117,7 @@ define void @i56_or(ptr %a) {
 ; X64-NEXT:    movzwl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 6(%rdi), %ecx
 ; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    shlq $32, %rcx
@@ -149,7 +149,7 @@ define void @i56_and_or(ptr %a) {
 ; X64-NEXT:    movzwl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 6(%rdi), %ecx
 ; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    shlq $32, %rcx
@@ -187,19 +187,18 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X64-NEXT:    movzwl 4(%rdi), %ecx
 ; X64-NEXT:    movzbl 6(%rdi), %edx
 ; X64-NEXT:    movb %dl, 6(%rdi)
-; X64-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-NEXT:    shll $16, %edx
 ; X64-NEXT:    orl %ecx, %edx
 ; X64-NEXT:    shlq $32, %rdx
 ; X64-NEXT:    movl (%rdi), %ecx
 ; X64-NEXT:    orq %rdx, %rcx
 ; X64-NEXT:    shlq $13, %rax
-; X64-NEXT:    movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    orq %rax, %rdx
-; X64-NEXT:    movl %edx, (%rdi)
-; X64-NEXT:    shrq $32, %rdx
-; X64-NEXT:    movw %dx, 4(%rdi)
+; X64-NEXT:    andq $-8193, %rcx # imm = 0xDFFF
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movl %ecx, (%rdi)
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movw %cx, 4(%rdi)
 ; X64-NEXT:    retq
   %extbit = zext i1 %bit to i56
   %b = load i56, ptr %a, align 1

diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index ecbdc2d91deae..8199c68616a9f 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,16 +191,17 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $188, %esp
+; X86-NEXT:    subl $184, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl $1, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -208,8 +209,9 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
@@ -229,18 +231,18 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %edi
@@ -269,139 +271,140 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -447,113 +450,118 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
@@ -566,55 +574,54 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb %al
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %al, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %eax, %edx
@@ -628,36 +635,38 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    imull %ebx
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    adcl %edx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %edi
@@ -679,127 +688,127 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl %ecx, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %edx, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    orl %eax, %ebp
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    andl $1, %edx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    xorl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %ebp, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    orl %esi, %ebx
@@ -817,7 +826,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movb %dl, 16(%eax)
 ; X86-NEXT:    setne 20(%eax)
-; X86-NEXT:    addl $188, %esp
+; X86-NEXT:    addl $184, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
index a4d2b7295af62..7c1efa7c8b48a 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
@@ -12,14 +12,13 @@ declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
 define i64 @func() nounwind {
 ; X64-LABEL: func:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $2, %ecx
-; X64-NEXT:    movl $3, %eax
-; X64-NEXT:    imulq %rcx
-; X64-NEXT:    cmpq $2, %rdx
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
 ; X64-NEXT:    movl $1, %ecx
 ; X64-NEXT:    cmovgeq %rax, %rcx
-; X64-NEXT:    cmpq $-2, %rdx
+; X64-NEXT:    movq $-2, %rax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
 ; X64-NEXT:    cmovgeq %rcx, %rax
 ; X64-NEXT:    retq
@@ -42,16 +41,15 @@ define i64 @func2() nounwind {
 define i64 @func3() nounwind {
 ; X64-LABEL: func3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    imulq %rdx
-; X64-NEXT:    cmpq $2, %rdx
-; X64-NEXT:    movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT:    cmovgeq %rcx, %rsi
-; X64-NEXT:    cmpq $-2, %rdx
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovgeq %rax, %rcx
+; X64-NEXT:    movq $-2, %rax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT:    cmovgeq %rsi, %rax
+; X64-NEXT:    cmovgeq %rcx, %rax
 ; X64-NEXT:    retq
   %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2)
   ret i64 %tmp
@@ -60,16 +58,15 @@ define i64 @func3() nounwind {
 define i64 @func4() nounwind {
 ; X64-LABEL: func4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    imulq %rdx
-; X64-NEXT:    cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
-; X64-NEXT:    movl $4294967295, %esi # imm = 0xFFFFFFFF
-; X64-NEXT:    cmovgq %rcx, %rsi
-; X64-NEXT:    cmpq $-2147483648, %rdx # imm = 0x80000000
+; X64-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    negq %rax
+; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT:    cmovgq %rax, %rcx
+; X64-NEXT:    movq $-2147483648, %rax # imm = 0x80000000
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT:    cmovgeq %rsi, %rax
+; X64-NEXT:    cmovgeq %rcx, %rax
 ; X64-NEXT:    retq
   %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32)
   ret i64 %tmp
@@ -78,18 +75,15 @@ define i64 @func4() nounwind {
 define i64 @func5() nounwind {
 ; X64-LABEL: func5:
 ; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    imulq %rdx
 ; X64-NEXT:    movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF
-; X64-NEXT:    cmpq %rax, %rdx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    cmovgq %rcx, %rsi
+; X64-NEXT:    negq %rax
+; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    cmovgq %rax, %rcx
 ; X64-NEXT:    movabsq $-4611686018427387904, %rax # imm = 0xC000000000000000
-; X64-NEXT:    cmpq %rax, %rdx
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT:    cmovgeq %rsi, %rax
+; X64-NEXT:    cmovgeq %rcx, %rax
 ; X64-NEXT:    retq
   %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63)
   ret i64 %tmp

diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index a80931bfaa836..a5ab87f744cde 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -558,12 +558,11 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -572,7 +571,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
@@ -648,19 +647,18 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1135,7 +1133,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -1143,19 +1141,18 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    por %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
@@ -1379,12 +1376,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1393,7 +1389,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index a9be2a5b9273e..e3477585f48bb 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -163,19 +163,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -241,19 +240,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -479,21 +477,20 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
@@ -559,19 +556,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -926,21 +922,20 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
@@ -1006,19 +1001,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1167,21 +1161,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
@@ -1842,21 +1835,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
@@ -1921,21 +1913,20 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
